From a5a18c85e039e7f369b7558308c62dca41c48206 Mon Sep 17 00:00:00 2001
From: Maximilian Michels
Date: Thu, 31 Jul 2025 16:13:55 +0200
Subject: [PATCH 1/5] Flink: Move Flink v2.0 to v2.1 directory

---
 flink/{v2.0 => v2.1}/build.gradle | 0
 flink/{v2.0 => v2.1}/flink-runtime/LICENSE | 0
 flink/{v2.0 => v2.1}/flink-runtime/NOTICE | 0
 .../java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java | 0
 .../dynamic/DynamicRecordSerializerDeserializerBenchmark.java | 0
 .../iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java | 0
 .../flink/sink/shuffle/SketchRangePartitionerBenchmark.java | 0
 .../flink/table/api/runtime/types/FlinkScalaKryoInstantiator.java | 0
 .../src/main/java/org/apache/iceberg/flink/CatalogLoader.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkCatalog.java | 0
 .../main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkConfParser.java | 0
 .../main/java/org/apache/iceberg/flink/FlinkConfigOptions.java | 0
 .../java/org/apache/iceberg/flink/FlinkCreateTableOptions.java | 0
 .../java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java | 0
 .../java/org/apache/iceberg/flink/FlinkEnvironmentContext.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkFilters.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkReadConf.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkRowData.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java | 0
 .../src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java | 0
 .../src/main/java/org/apache/iceberg/flink/IcebergTableSink.java | 0
 .../src/main/java/org/apache/iceberg/flink/RowDataWrapper.java | 0
 .../flink/src/main/java/org/apache/iceberg/flink/TableLoader.java | 0
 .../src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java | 0
 .../src/main/java/org/apache/iceberg/flink/actions/Actions.java | 0
 .../org/apache/iceberg/flink/actions/RewriteDataFilesAction.java | 0
 .../org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java | 0
 .../main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java | 0
 .../main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java | 0
 .../main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java | 0
 .../main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java | 0
 .../main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java | 0
 .../java/org/apache/iceberg/flink/data/FlinkParquetReaders.java | 0
 .../java/org/apache/iceberg/flink/data/FlinkParquetWriters.java | 0
 .../org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java | 0
 .../java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java | 0
 .../java/org/apache/iceberg/flink/data/FlinkValueReaders.java | 0
 .../java/org/apache/iceberg/flink/data/FlinkValueWriters.java | 0
 .../apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java | 0
 .../java/org/apache/iceberg/flink/data/RowDataProjection.java | 0
 .../src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java | 0
 .../main/java/org/apache/iceberg/flink/data/StructRowData.java | 0
 .../apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java | 0
.../org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java | 0 .../iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java | 0 .../org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java | 0 .../java/org/apache/iceberg/flink/maintenance/api/LockConfig.java | 0 .../iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java | 0 .../apache/iceberg/flink/maintenance/api/RewriteDataFiles.java | 0 .../iceberg/flink/maintenance/api/RewriteDataFilesConfig.java | 0 .../apache/iceberg/flink/maintenance/api/TableMaintenance.java | 0 .../java/org/apache/iceberg/flink/maintenance/api/TaskResult.java | 0 .../java/org/apache/iceberg/flink/maintenance/api/Trigger.java | 0 .../apache/iceberg/flink/maintenance/api/TriggerLockFactory.java | 0 .../org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java | 0 .../flink/maintenance/operator/DataFileRewriteCommitter.java | 0 .../flink/maintenance/operator/DataFileRewritePlanner.java | 0 .../iceberg/flink/maintenance/operator/DataFileRewriteRunner.java | 0 .../iceberg/flink/maintenance/operator/DeleteFilesProcessor.java | 0 .../flink/maintenance/operator/ExpireSnapshotsProcessor.java | 0 .../apache/iceberg/flink/maintenance/operator/FileNameReader.java | 0 .../iceberg/flink/maintenance/operator/FileUriKeySelector.java | 0 .../iceberg/flink/maintenance/operator/ListFileSystemFiles.java | 0 .../iceberg/flink/maintenance/operator/ListMetadataFiles.java | 0 .../iceberg/flink/maintenance/operator/LockFactoryBuilder.java | 0 .../apache/iceberg/flink/maintenance/operator/LockRemover.java | 0 .../org/apache/iceberg/flink/maintenance/operator/LogUtil.java | 0 .../iceberg/flink/maintenance/operator/MetadataTablePlanner.java | 0 .../apache/iceberg/flink/maintenance/operator/MonitorSource.java | 0 .../iceberg/flink/maintenance/operator/OrphanFilesDetector.java | 0 .../flink/maintenance/operator/SingleThreadedIteratorSource.java | 0 .../apache/iceberg/flink/maintenance/operator/SkipOnError.java | 0 .../apache/iceberg/flink/maintenance/operator/TableChange.java | 0 .../flink/maintenance/operator/TableMaintenanceMetrics.java | 0 .../apache/iceberg/flink/maintenance/operator/TableReader.java | 0 .../iceberg/flink/maintenance/operator/TaskResultAggregator.java | 0 .../iceberg/flink/maintenance/operator/TriggerEvaluator.java | 0 .../apache/iceberg/flink/maintenance/operator/TriggerManager.java | 0 .../iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java | 0 .../java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java | 0 .../org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java | 0 .../java/org/apache/iceberg/flink/sink/BucketPartitioner.java | 0 .../java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java | 0 .../java/org/apache/iceberg/flink/sink/CachingTableSupplier.java | 0 .../main/java/org/apache/iceberg/flink/sink/CommitSummary.java | 0 .../iceberg/flink/sink/CommittableToTableChangeConverter.java | 0 .../main/java/org/apache/iceberg/flink/sink/DeltaManifests.java | 0 .../org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java | 0 .../org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java | 0 .../java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java | 0 .../org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java | 0 .../java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java | 0 .../src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java | 0 .../main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java | 0 .../java/org/apache/iceberg/flink/sink/IcebergCommittable.java | 0 
.../apache/iceberg/flink/sink/IcebergCommittableSerializer.java | 0 .../main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java | 0 .../java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java | 0 .../apache/iceberg/flink/sink/IcebergFilesCommitterFactory.java | 0 .../apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java | 0 .../src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java | 0 .../java/org/apache/iceberg/flink/sink/IcebergSinkBuilder.java | 0 .../java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java | 0 .../java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java | 0 .../org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java | 0 .../org/apache/iceberg/flink/sink/IcebergWriteAggregator.java | 0 .../org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java | 0 .../java/org/apache/iceberg/flink/sink/PartitionKeySelector.java | 0 .../org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java | 0 .../org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java | 0 .../src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java | 0 .../java/org/apache/iceberg/flink/sink/TaskWriterFactory.java | 0 .../org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java | 0 .../java/org/apache/iceberg/flink/sink/WriteResultSerializer.java | 0 .../apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java | 0 .../java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java | 0 .../org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java | 0 .../iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java | 0 .../org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java | 0 .../iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java | 0 .../org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java | 0 .../java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java | 0 .../apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java | 0 .../apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java | 0 .../flink/sink/dynamic/DynamicRecordInternalSerializer.java | 0 .../iceberg/flink/sink/dynamic/DynamicRecordInternalType.java | 0 .../apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java | 0 .../org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java | 0 .../iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java | 0 .../org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java | 0 .../iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java | 0 .../iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java | 0 .../java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java | 0 .../apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java | 0 .../apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java | 0 .../org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java | 0 .../main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java | 0 .../apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java | 0 .../org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java | 0 .../apache/iceberg/flink/sink/dynamic/TableSerializerCache.java | 0 .../java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java | 0 .../java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java | 0 .../iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java | 0 .../apache/iceberg/flink/sink/shuffle/CompletedStatistics.java | 0 .../iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java | 0 .../org/apache/iceberg/flink/sink/shuffle/DataStatistics.java | 0 .../iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java | 0 
.../flink/sink/shuffle/DataStatisticsCoordinatorProvider.java | 0 .../apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java | 0 .../iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java | 0 .../iceberg/flink/sink/shuffle/DataStatisticsSerializer.java | 0 .../org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java | 0 .../iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java | 0 .../java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java | 0 .../java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java | 0 .../org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java | 0 .../apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java | 0 .../org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java | 0 .../iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java | 0 .../apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java | 0 .../apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java | 0 .../java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java | 0 .../org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java | 0 .../iceberg/flink/sink/shuffle/SortKeySketchSerializer.java | 0 .../java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java | 0 .../org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java | 0 .../org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java | 0 .../iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java | 0 .../flink/sink/shuffle/StatisticsOrRecordTypeInformation.java | 0 .../org/apache/iceberg/flink/sink/shuffle/StatisticsType.java | 0 .../org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java | 0 .../iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java | 0 .../main/java/org/apache/iceberg/flink/source/DataIterator.java | 0 .../main/java/org/apache/iceberg/flink/source/DataTaskReader.java | 0 .../java/org/apache/iceberg/flink/source/FileScanTaskReader.java | 0 .../java/org/apache/iceberg/flink/source/FlinkInputFormat.java | 0 .../java/org/apache/iceberg/flink/source/FlinkInputSplit.java | 0 .../main/java/org/apache/iceberg/flink/source/FlinkSource.java | 0 .../java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java | 0 .../main/java/org/apache/iceberg/flink/source/IcebergSource.java | 0 .../java/org/apache/iceberg/flink/source/IcebergTableSource.java | 0 .../apache/iceberg/flink/source/RowDataFileScanTaskReader.java | 0 .../java/org/apache/iceberg/flink/source/RowDataRewriter.java | 0 .../iceberg/flink/source/RowDataToAvroGenericRecordConverter.java | 0 .../main/java/org/apache/iceberg/flink/source/ScanContext.java | 0 .../src/main/java/org/apache/iceberg/flink/source/SourceUtil.java | 0 .../org/apache/iceberg/flink/source/StreamingMonitorFunction.java | 0 .../org/apache/iceberg/flink/source/StreamingReaderOperator.java | 0 .../apache/iceberg/flink/source/StreamingStartingStrategy.java | 0 .../iceberg/flink/source/assigner/DefaultSplitAssigner.java | 0 .../org/apache/iceberg/flink/source/assigner/GetSplitResult.java | 0 .../flink/source/assigner/OrderedSplitAssignerFactory.java | 0 .../iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java | 0 .../org/apache/iceberg/flink/source/assigner/SplitAssigner.java | 0 .../iceberg/flink/source/assigner/SplitAssignerFactory.java | 0 .../apache/iceberg/flink/source/assigner/SplitAssignerType.java | 0 .../flink/source/enumerator/AbstractIcebergEnumerator.java | 0 .../flink/source/enumerator/ContinuousEnumerationResult.java | 0 .../flink/source/enumerator/ContinuousIcebergEnumerator.java | 0 
.../iceberg/flink/source/enumerator/ContinuousSplitPlanner.java | 0 .../flink/source/enumerator/ContinuousSplitPlannerImpl.java | 0 .../iceberg/flink/source/enumerator/EnumerationHistory.java | 0 .../flink/source/enumerator/IcebergEnumeratorPosition.java | 0 .../source/enumerator/IcebergEnumeratorPositionSerializer.java | 0 .../iceberg/flink/source/enumerator/IcebergEnumeratorState.java | 0 .../flink/source/enumerator/IcebergEnumeratorStateSerializer.java | 0 .../iceberg/flink/source/enumerator/StaticIcebergEnumerator.java | 0 .../org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java | 0 .../iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java | 0 .../iceberg/flink/source/reader/AvroGenericRecordConverter.java | 0 .../flink/source/reader/AvroGenericRecordReaderFunction.java | 0 .../flink/source/reader/ColumnStatsWatermarkExtractor.java | 0 .../iceberg/flink/source/reader/ConverterReaderFunction.java | 0 .../apache/iceberg/flink/source/reader/DataIteratorBatcher.java | 0 .../iceberg/flink/source/reader/DataIteratorReaderFunction.java | 0 .../apache/iceberg/flink/source/reader/IcebergSourceReader.java | 0 .../iceberg/flink/source/reader/IcebergSourceReaderMetrics.java | 0 .../iceberg/flink/source/reader/IcebergSourceSplitReader.java | 0 .../apache/iceberg/flink/source/reader/LimitableDataIterator.java | 0 .../org/apache/iceberg/flink/source/reader/ListBatchRecords.java | 0 .../iceberg/flink/source/reader/ListDataIteratorBatcher.java | 0 .../iceberg/flink/source/reader/MetaDataReaderFunction.java | 0 .../org/apache/iceberg/flink/source/reader/ReaderFunction.java | 0 .../org/apache/iceberg/flink/source/reader/RecordAndPosition.java | 0 .../org/apache/iceberg/flink/source/reader/RecordFactory.java | 0 .../org/apache/iceberg/flink/source/reader/RecordLimiter.java | 0 .../java/org/apache/iceberg/flink/source/reader/RowConverter.java | 0 .../org/apache/iceberg/flink/source/reader/RowDataConverter.java | 0 .../apache/iceberg/flink/source/reader/RowDataReaderFunction.java | 0 .../apache/iceberg/flink/source/reader/RowDataRecordFactory.java | 0 .../iceberg/flink/source/reader/SerializableRecordEmitter.java | 0 .../iceberg/flink/source/reader/SplitWatermarkExtractor.java | 0 .../flink/source/reader/WatermarkExtractorRecordEmitter.java | 0 .../org/apache/iceberg/flink/source/split/IcebergSourceSplit.java | 0 .../iceberg/flink/source/split/IcebergSourceSplitSerializer.java | 0 .../iceberg/flink/source/split/IcebergSourceSplitState.java | 0 .../iceberg/flink/source/split/IcebergSourceSplitStatus.java | 0 .../apache/iceberg/flink/source/split/SerializableComparator.java | 0 .../org/apache/iceberg/flink/source/split/SerializerHelper.java | 0 .../org/apache/iceberg/flink/source/split/SplitComparators.java | 0 .../org/apache/iceberg/flink/source/split/SplitRequestEvent.java | 0 .../main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java | 0 .../java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java | 0 .../org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java | 0 .../src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java | 0 .../META-INF/services/org.apache.flink.table.factories.Factory | 0 .../org/apache/iceberg/flink/AvroGenericRecordConverterBase.java | 0 .../src/test/java/org/apache/iceberg/flink/CatalogTestBase.java | 0 .../src/test/java/org/apache/iceberg/flink/DataGenerator.java | 0 .../src/test/java/org/apache/iceberg/flink/DataGenerators.java | 0 .../java/org/apache/iceberg/flink/HadoopCatalogExtension.java | 0 
.../test/java/org/apache/iceberg/flink/HadoopTableExtension.java | 0 .../java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java | 0 .../src/test/java/org/apache/iceberg/flink/RowDataConverter.java | 0 .../src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java | 0 .../flink/src/test/java/org/apache/iceberg/flink/SqlBase.java | 0 .../flink/src/test/java/org/apache/iceberg/flink/TestBase.java | 0 .../src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java | 0 .../java/org/apache/iceberg/flink/TestCatalogTableLoader.java | 0 .../test/java/org/apache/iceberg/flink/TestChangeLogTable.java | 0 .../java/org/apache/iceberg/flink/TestDataFileSerialization.java | 0 .../src/test/java/org/apache/iceberg/flink/TestFixtures.java | 0 .../java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java | 0 .../java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java | 0 .../java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java | 0 .../test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java | 0 .../org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java | 0 .../test/java/org/apache/iceberg/flink/TestFlinkConfParser.java | 0 .../src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java | 0 .../test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java | 0 .../test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java | 0 .../test/java/org/apache/iceberg/flink/TestFlinkTableSink.java | 0 .../org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java | 0 .../java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java | 0 .../src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java | 0 .../flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java | 0 .../test/java/org/apache/iceberg/flink/TestIcebergConnector.java | 0 .../org/apache/iceberg/flink/TestManifestFileSerialization.java | 0 .../test/java/org/apache/iceberg/flink/TestRowDataWrapper.java | 0 .../src/test/java/org/apache/iceberg/flink/TestTableLoader.java | 0 .../java/org/apache/iceberg/flink/TestTableSerialization.java | 0 .../apache/iceberg/flink/actions/TestRewriteDataFilesAction.java | 0 .../test/java/org/apache/iceberg/flink/data/RandomRowData.java | 0 .../java/org/apache/iceberg/flink/data/RowDataToRowMapper.java | 0 .../org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java | 0 .../org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java | 0 .../org/apache/iceberg/flink/data/TestFlinkParquetReader.java | 0 .../org/apache/iceberg/flink/data/TestFlinkParquetWriter.java | 0 .../java/org/apache/iceberg/flink/data/TestRowDataProjection.java | 0 .../java/org/apache/iceberg/flink/data/TestRowProjection.java | 0 .../java/org/apache/iceberg/flink/data/TestStructRowData.java | 0 .../flink/maintenance/api/MaintenanceTaskInfraExtension.java | 0 .../iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java | 0 .../iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java | 0 .../apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java | 0 .../apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java | 0 .../apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java | 0 .../apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java | 0 .../iceberg/flink/maintenance/api/TestRewriteDataFiles.java | 0 .../iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java | 0 .../iceberg/flink/maintenance/api/TestTableMaintenance.java | 0 .../apache/iceberg/flink/maintenance/api/TestZkLockFactory.java | 0 .../apache/iceberg/flink/maintenance/operator/CollectingSink.java | 0 
.../apache/iceberg/flink/maintenance/operator/ManualSource.java | 0 .../maintenance/operator/MetricsReporterFactoryForTests.java | 0 .../iceberg/flink/maintenance/operator/OperatorTestBase.java | 0 .../apache/iceberg/flink/maintenance/operator/RewriteUtil.java | 0 .../flink/maintenance/operator/TestDataFileRewriteCommitter.java | 0 .../flink/maintenance/operator/TestDataFileRewritePlanner.java | 0 .../flink/maintenance/operator/TestDataFileRewriteRunner.java | 0 .../flink/maintenance/operator/TestDeleteFilesProcessor.java | 0 .../flink/maintenance/operator/TestExpireSnapshotsProcessor.java | 0 .../flink/maintenance/operator/TestListFileSystemFiles.java | 0 .../iceberg/flink/maintenance/operator/TestListMetadataFiles.java | 0 .../apache/iceberg/flink/maintenance/operator/TestLockConfig.java | 0 .../flink/maintenance/operator/TestLockFactoryBuilder.java | 0 .../iceberg/flink/maintenance/operator/TestLockRemover.java | 0 .../iceberg/flink/maintenance/operator/TestMonitorSource.java | 0 .../flink/maintenance/operator/TestOrphanFilesDetector.java | 0 .../iceberg/flink/maintenance/operator/TestSkipOnError.java | 0 .../flink/maintenance/operator/TestTablePlanerAndReader.java | 0 .../flink/maintenance/operator/TestTaskResultAggregator.java | 0 .../iceberg/flink/maintenance/operator/TestTriggerManager.java | 0 .../src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java | 0 .../iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java | 0 .../apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java | 0 .../java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java | 0 .../iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java | 0 .../org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java | 0 .../org/apache/iceberg/flink/sink/TestCachingTableSupplier.java | 0 .../iceberg/flink/sink/TestCommittableToTableChangeConverter.java | 0 .../org/apache/iceberg/flink/sink/TestCompressionSettings.java | 0 .../java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java | 0 .../java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java | 0 .../iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java | 0 .../apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java | 0 .../sink/TestFlinkIcebergSinkRangeDistributionBucketing.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java | 0 .../apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java | 0 .../flink/sink/TestFlinkIcebergSinkV2DistributionMode.java | 0 .../java/org/apache/iceberg/flink/sink/TestFlinkManifest.java | 0 .../apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java | 0 .../apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java | 0 .../apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java | 0 .../java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java | 0 .../org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java | 0 .../test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java | 0 .../java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java | 0 .../org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java | 0 
.../java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java | 0 .../org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java | 0 .../org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java | 0 .../org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java | 0 .../test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java | 0 .../sink/dynamic/DynamicRecordInternalSerializerTestBase.java | 0 .../iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java | 0 .../flink/sink/dynamic/TestDynamicCommittableSerializer.java | 0 .../apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java | 0 .../apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java | 0 .../iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java | 0 .../dynamic/TestDynamicRecordInternalSerializerWriteSchema.java | 0 .../dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java | 0 .../flink/sink/dynamic/TestDynamicTableUpdateOperator.java | 0 .../flink/sink/dynamic/TestDynamicWriteResultAggregator.java | 0 .../flink/sink/dynamic/TestDynamicWriteResultSerializer.java | 0 .../org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java | 0 .../iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java | 0 .../apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java | 0 .../java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java | 0 .../iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java | 0 .../apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java | 0 .../apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java | 0 .../iceberg/flink/sink/dynamic/TestTableSerializerCache.java | 0 .../org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java | 0 .../apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java | 0 .../test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java | 0 .../flink/sink/shuffle/TestAggregatedStatisticsTracker.java | 0 .../flink/sink/shuffle/TestCompletedStatisticsSerializer.java | 0 .../iceberg/flink/sink/shuffle/TestDataDistributionUtil.java | 0 .../iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java | 0 .../flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java | 0 .../iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java | 0 .../iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java | 0 .../flink/sink/shuffle/TestGlobalStatisticsSerializer.java | 0 .../apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java | 0 .../iceberg/flink/sink/shuffle/TestMapRangePartitioner.java | 0 .../apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java | 0 .../iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java | 0 .../iceberg/flink/sink/shuffle/TestSketchDataStatistics.java | 0 .../iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java | 0 .../org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java | 0 .../iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java | 0 .../flink/sink/shuffle/TestSortKeySerializerNestedStruct.java | 0 .../flink/sink/shuffle/TestSortKeySerializerPrimitives.java | 0 .../iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java | 0 .../org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java | 0 .../flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java | 0 .../java/org/apache/iceberg/flink/source/BoundedTableFactory.java | 0 .../java/org/apache/iceberg/flink/source/BoundedTestSource.java | 0 .../org/apache/iceberg/flink/source/ChangeLogTableTestBase.java | 0 .../test/java/org/apache/iceberg/flink/source/SplitHelpers.java | 0 
.../src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java | 0 .../java/org/apache/iceberg/flink/source/TableSourceTestBase.java | 0 .../org/apache/iceberg/flink/source/TestBoundedTableFactory.java | 0 .../org/apache/iceberg/flink/source/TestFlinkInputFormat.java | 0 .../iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java | 0 .../org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java | 0 .../org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java | 0 .../apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java | 0 .../test/java/org/apache/iceberg/flink/source/TestFlinkScan.java | 0 .../java/org/apache/iceberg/flink/source/TestFlinkScanSql.java | 0 .../java/org/apache/iceberg/flink/source/TestFlinkSource.java | 0 .../org/apache/iceberg/flink/source/TestFlinkSourceConfig.java | 0 .../java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java | 0 .../org/apache/iceberg/flink/source/TestFlinkTableSource.java | 0 .../org/apache/iceberg/flink/source/TestIcebergSourceBounded.java | 0 .../flink/source/TestIcebergSourceBoundedConverterBase.java | 0 .../flink/source/TestIcebergSourceBoundedGenericRecord.java | 0 .../apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java | 0 .../apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java | 0 .../apache/iceberg/flink/source/TestIcebergSourceContinuous.java | 0 .../apache/iceberg/flink/source/TestIcebergSourceFailover.java | 0 .../source/TestIcebergSourceFailoverWithWatermarkExtractor.java | 0 .../iceberg/flink/source/TestIcebergSourceInferParallelism.java | 0 .../iceberg/flink/source/TestIcebergSourceReaderDeletes.java | 0 .../org/apache/iceberg/flink/source/TestIcebergSourceSql.java | 0 .../flink/source/TestIcebergSourceWithWatermarkExtractor.java | 0 .../flink/source/TestIcebergSpeculativeExecutionSupport.java | 0 .../iceberg/flink/source/TestMetadataTableReadableMetrics.java | 0 .../org/apache/iceberg/flink/source/TestProjectMetaColumn.java | 0 .../flink/source/TestRowDataToAvroGenericRecordConverter.java | 0 .../java/org/apache/iceberg/flink/source/TestScanContext.java | 0 .../test/java/org/apache/iceberg/flink/source/TestSourceUtil.java | 0 .../test/java/org/apache/iceberg/flink/source/TestSqlBase.java | 0 .../java/org/apache/iceberg/flink/source/TestStreamScanSql.java | 0 .../apache/iceberg/flink/source/TestStreamingMonitorFunction.java | 0 .../apache/iceberg/flink/source/TestStreamingReaderOperator.java | 0 .../iceberg/flink/source/assigner/SplitAssignerTestBase.java | 0 .../iceberg/flink/source/assigner/TestDefaultSplitAssigner.java | 0 .../source/assigner/TestFileSequenceNumberBasedSplitAssigner.java | 0 .../flink/source/assigner/TestWatermarkBasedSplitAssigner.java | 0 .../flink/source/enumerator/ManualContinuousSplitPlanner.java | 0 .../flink/source/enumerator/TestContinuousIcebergEnumerator.java | 0 .../flink/source/enumerator/TestContinuousSplitPlannerImpl.java | 0 .../enumerator/TestContinuousSplitPlannerImplStartStrategy.java | 0 .../iceberg/flink/source/enumerator/TestEnumerationHistory.java | 0 .../source/enumerator/TestIcebergEnumeratorStateSerializer.java | 0 .../iceberg/flink/source/reader/ReaderFunctionTestBase.java | 0 .../java/org/apache/iceberg/flink/source/reader/ReaderUtil.java | 0 .../apache/iceberg/flink/source/reader/TestArrayBatchRecords.java | 0 .../source/reader/TestArrayPoolDataIteratorBatcherRowData.java | 0 .../flink/source/reader/TestColumnStatsWatermarkExtractor.java | 0 .../iceberg/flink/source/reader/TestIcebergSourceReader.java | 0 
.../iceberg/flink/source/reader/TestLimitableDataIterator.java | 0 .../iceberg/flink/source/reader/TestRowDataReaderFunction.java | 0 .../apache/iceberg/flink/source/reader/TestingMetricGroup.java | 0 .../flink/source/split/TestIcebergSourceSplitSerializer.java | 0 .../test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java | 0 .../org.apache.flink.metrics.reporter.MetricReporterFactory | 0 .../META-INF/services/org.apache.flink.table.factories.Factory | 0 473 files changed, 0 insertions(+), 0 deletions(-) rename flink/{v2.0 => v2.1}/build.gradle (100%) rename flink/{v2.0 => v2.1}/flink-runtime/LICENSE (100%) rename flink/{v2.0 => v2.1}/flink-runtime/NOTICE (100%) rename flink/{v2.0 => v2.1}/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java (100%) rename flink/{v2.0 => v2.1}/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java (100%) rename flink/{v2.0 => v2.1}/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java (100%) rename flink/{v2.0 => v2.1}/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/flink/table/api/runtime/types/FlinkScalaKryoInstantiator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java (100%) rename flink/{v2.0 => 
v2.1}/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java (100%) rename 
flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java (100%) rename 
flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkBuilder.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java 
(100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java (100%) rename flink/{v2.0 => 
v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java (100%) rename 
flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java (100%) 
rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java (100%) rename 
flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java (100%) rename flink/{v2.0 => v2.1}/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java (100%) rename flink/{v2.0 => 
v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java (100%) rename 
flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java (100%) rename flink/{v2.0 => 
v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java (100%) rename flink/{v2.0 => 
v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java (100%) rename flink/{v2.0 
=> v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java (100%) rename flink/{v2.0 => 
v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java (100%) rename flink/{v2.0 
=> v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java (100%) rename flink/{v2.0 => v2.1}/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory (100%) rename flink/{v2.0 => v2.1}/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory (100%) diff --git a/flink/v2.0/build.gradle b/flink/v2.1/build.gradle similarity index 100% rename from flink/v2.0/build.gradle rename to flink/v2.1/build.gradle diff --git a/flink/v2.0/flink-runtime/LICENSE b/flink/v2.1/flink-runtime/LICENSE similarity index 100% rename from flink/v2.0/flink-runtime/LICENSE rename to flink/v2.1/flink-runtime/LICENSE diff --git a/flink/v2.0/flink-runtime/NOTICE b/flink/v2.1/flink-runtime/NOTICE similarity index 100% rename from flink/v2.0/flink-runtime/NOTICE rename to flink/v2.1/flink-runtime/NOTICE diff --git 
a/flink/v2.0/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java b/flink/v2.1/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java similarity index 100% rename from flink/v2.0/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java rename to flink/v2.1/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java diff --git a/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java b/flink/v2.1/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java similarity index 100% rename from flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java rename to flink/v2.1/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java diff --git a/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java b/flink/v2.1/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java similarity index 100% rename from flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java rename to flink/v2.1/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java diff --git a/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java b/flink/v2.1/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java similarity index 100% rename from flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java rename to flink/v2.1/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/flink/table/api/runtime/types/FlinkScalaKryoInstantiator.java b/flink/v2.1/flink/src/main/java/org/apache/flink/table/api/runtime/types/FlinkScalaKryoInstantiator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/flink/table/api/runtime/types/FlinkScalaKryoInstantiator.java rename to flink/v2.1/flink/src/main/java/org/apache/flink/table/api/runtime/types/FlinkScalaKryoInstantiator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java 
similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java rename to 
flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java similarity index 100% rename from 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java similarity index 100% rename from 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkBuilder.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkBuilder.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkBuilder.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkBuilder.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java rename to 
flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java rename to 
flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java rename to 
flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java similarity index 100% rename from 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java rename to 
flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java 
similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java similarity index 100% rename from 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java similarity index 100% rename 
from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java similarity index 100% rename from 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java similarity index 100% rename from flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java rename to flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java diff --git a/flink/v2.0/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v2.1/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory similarity index 100% rename from flink/v2.0/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory rename to flink/v2.1/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java similarity index 100% rename from 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java diff --git 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java diff --git 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java diff --git 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java rename to 
flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java rename to 
flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java similarity index 100% rename from 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java diff --git 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java similarity index 100% 
rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java 
b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java 
b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java 
b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java 
b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java rename to 
flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java similarity index 100% rename from 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java rename to 
flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java rename to 
flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java 
b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java diff --git 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java 
rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java 
b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java rename to 
flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java similarity index 100% rename from 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java diff --git 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java 
b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java 
b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java similarity index 100% rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java rename to 
flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java
similarity index 100%
rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java
rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java
similarity index 100%
rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java
rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java
similarity index 100%
rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java
rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java
similarity index 100%
rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java
rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java
similarity index 100%
rename from flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java
rename to flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java
diff --git a/flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory b/flink/v2.1/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory
similarity index 100%
rename from flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory
rename to flink/v2.1/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory
diff --git a/flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v2.1/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory
similarity index 100%
rename from flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory
rename to flink/v2.1/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory

From 10cedb3a839408bc810af16eba9802b1d158c8e0 Mon Sep 17 00:00:00 2001
From: Maximilian Michels
Date: Thu, 31 Jul 2025 16:14:07 +0200
Subject: [PATCH 2/5] Flink: Add back v2.0 directory

---
 flink/v2.0/build.gradle | 271 ++++
 flink/v2.0/flink-runtime/LICENSE | 520 ++++++
flink/v2.0/flink-runtime/NOTICE | 360 +++++ .../flink/TestIcebergConnectorSmoke.java | 21 + ...RecordSerializerDeserializerBenchmark.java | 138 ++ .../shuffle/MapRangePartitionerBenchmark.java | 121 ++ .../SketchRangePartitionerBenchmark.java | 114 ++ .../types/FlinkScalaKryoInstantiator.java | 26 + .../apache/iceberg/flink/CatalogLoader.java | 215 +++ .../apache/iceberg/flink/FlinkCatalog.java | 872 ++++++++++ .../iceberg/flink/FlinkCatalogFactory.java | 216 +++ .../apache/iceberg/flink/FlinkConfParser.java | 297 ++++ .../iceberg/flink/FlinkConfigOptions.java | 113 ++ .../flink/FlinkCreateTableOptions.java | 116 ++ .../flink/FlinkDynamicTableFactory.java | 235 +++ .../flink/FlinkEnvironmentContext.java | 31 + .../apache/iceberg/flink/FlinkFilters.java | 266 +++ .../apache/iceberg/flink/FlinkFixupTypes.java | 50 + .../apache/iceberg/flink/FlinkReadConf.java | 213 +++ .../iceberg/flink/FlinkReadOptions.java | 123 ++ .../apache/iceberg/flink/FlinkRowData.java | 47 + .../apache/iceberg/flink/FlinkSchemaUtil.java | 380 +++++ .../iceberg/flink/FlinkSourceFilter.java | 49 + .../apache/iceberg/flink/FlinkTypeToType.java | 203 +++ .../iceberg/flink/FlinkTypeVisitor.java | 80 + .../apache/iceberg/flink/FlinkWriteConf.java | 239 +++ .../iceberg/flink/FlinkWriteOptions.java | 90 ++ .../iceberg/flink/IcebergTableSink.java | 182 +++ .../apache/iceberg/flink/RowDataWrapper.java | 142 ++ .../org/apache/iceberg/flink/TableLoader.java | 159 ++ .../apache/iceberg/flink/TypeToFlinkType.java | 146 ++ .../apache/iceberg/flink/actions/Actions.java | 52 + .../flink/actions/RewriteDataFilesAction.java | 76 + .../data/AvroWithFlinkSchemaVisitor.java | 75 + .../iceberg/flink/data/FlinkAvroWriter.java | 168 ++ .../iceberg/flink/data/FlinkOrcReader.java | 131 ++ .../iceberg/flink/data/FlinkOrcReaders.java | 283 ++++ .../iceberg/flink/data/FlinkOrcWriter.java | 163 ++ .../iceberg/flink/data/FlinkOrcWriters.java | 318 ++++ .../flink/data/FlinkParquetReaders.java | 860 ++++++++++ .../flink/data/FlinkParquetWriters.java | 608 +++++++ .../flink/data/FlinkPlannedAvroReader.java | 196 +++ .../flink/data/FlinkSchemaVisitor.java | 161 ++ .../iceberg/flink/data/FlinkValueReaders.java | 356 ++++ .../iceberg/flink/data/FlinkValueWriters.java | 269 +++ .../data/ParquetWithFlinkSchemaVisitor.java | 223 +++ .../iceberg/flink/data/RowDataProjection.java | 342 ++++ .../iceberg/flink/data/RowDataUtil.java | 109 ++ .../iceberg/flink/data/StructRowData.java | 300 ++++ .../maintenance/api/DeleteOrphanFiles.java | 324 ++++ .../maintenance/api/ExpireSnapshots.java | 144 ++ .../api/FlinkMaintenanceConfig.java | 128 ++ .../maintenance/api/JdbcLockFactory.java | 325 ++++ .../flink/maintenance/api/LockConfig.java | 218 +++ .../api/MaintenanceTaskBuilder.java | 229 +++ .../maintenance/api/RewriteDataFiles.java | 294 ++++ .../api/RewriteDataFilesConfig.java | 184 +++ .../maintenance/api/TableMaintenance.java | 336 ++++ .../flink/maintenance/api/TaskResult.java | 64 + .../flink/maintenance/api/Trigger.java | 65 + .../maintenance/api/TriggerLockFactory.java | 64 + .../flink/maintenance/api/ZkLockFactory.java | 225 +++ .../operator/DataFileRewriteCommitter.java | 199 +++ .../operator/DataFileRewritePlanner.java | 217 +++ .../operator/DataFileRewriteRunner.java | 253 +++ .../operator/DeleteFilesProcessor.java | 121 ++ .../operator/ExpireSnapshotsProcessor.java | 136 ++ .../maintenance/operator/FileNameReader.java | 49 + .../operator/FileUriKeySelector.java | 60 + .../operator/ListFileSystemFiles.java | 133 ++ .../operator/ListMetadataFiles.java | 93 
++ .../operator/LockFactoryBuilder.java | 87 + .../maintenance/operator/LockRemover.java | 146 ++ .../flink/maintenance/operator/LogUtil.java | 26 + .../operator/MetadataTablePlanner.java | 133 ++ .../maintenance/operator/MonitorSource.java | 206 +++ .../operator/OrphanFilesDetector.java | 191 +++ .../SingleThreadedIteratorSource.java | 197 +++ .../maintenance/operator/SkipOnError.java | 90 ++ .../maintenance/operator/TableChange.java | 244 +++ .../operator/TableMaintenanceMetrics.java | 75 + .../maintenance/operator/TableReader.java | 120 ++ .../operator/TaskResultAggregator.java | 104 ++ .../operator/TriggerEvaluator.java | 151 ++ .../maintenance/operator/TriggerManager.java | 327 ++++ .../AvroGenericRecordToRowDataMapper.java | 61 + .../flink/sink/BaseDeltaTaskWriter.java | 126 ++ .../sink/BucketPartitionKeySelector.java | 70 + .../iceberg/flink/sink/BucketPartitioner.java | 103 ++ .../flink/sink/BucketPartitionerUtil.java | 125 ++ .../flink/sink/CachingTableSupplier.java | 91 ++ .../iceberg/flink/sink/CommitSummary.java | 103 ++ .../CommittableToTableChangeConverter.java | 107 ++ .../iceberg/flink/sink/DeltaManifests.java | 73 + .../flink/sink/DeltaManifestsSerializer.java | 124 ++ .../flink/sink/EqualityFieldKeySelector.java | 88 + .../flink/sink/FlinkAppenderFactory.java | 274 ++++ .../flink/sink/FlinkFileWriterFactory.java | 293 ++++ .../iceberg/flink/sink/FlinkManifestUtil.java | 168 ++ .../apache/iceberg/flink/sink/FlinkSink.java | 783 +++++++++ .../iceberg/flink/sink/FlinkWriteResult.java | 40 + .../flink/sink/IcebergCommittable.java | 95 ++ .../sink/IcebergCommittableSerializer.java | 68 + .../iceberg/flink/sink/IcebergCommitter.java | 317 ++++ .../flink/sink/IcebergFilesCommitter.java | 486 ++++++ .../sink/IcebergFilesCommitterFactory.java | 72 + .../sink/IcebergFilesCommitterMetrics.java | 75 + .../iceberg/flink/sink/IcebergSink.java | 972 +++++++++++ .../flink/sink/IcebergSinkBuilder.java | 103 ++ .../iceberg/flink/sink/IcebergSinkWriter.java | 113 ++ .../flink/sink/IcebergStreamWriter.java | 119 ++ .../sink/IcebergStreamWriterMetrics.java | 100 ++ .../flink/sink/IcebergWriteAggregator.java | 127 ++ .../flink/sink/ManifestOutputFileFactory.java | 98 ++ .../flink/sink/PartitionKeySelector.java | 66 + .../flink/sink/PartitionedDeltaWriter.java | 97 ++ .../flink/sink/RowDataTaskWriterFactory.java | 268 +++ .../apache/iceberg/flink/sink/SinkUtil.java | 150 ++ .../iceberg/flink/sink/TaskWriterFactory.java | 45 + .../flink/sink/UnpartitionedDeltaWriter.java | 69 + .../flink/sink/WriteResultSerializer.java | 63 + .../sink/dynamic/CompareSchemasVisitor.java | 275 ++++ .../flink/sink/dynamic/DataConverter.java | 235 +++ .../sink/dynamic/DynamicCommittable.java | 104 ++ .../dynamic/DynamicCommittableSerializer.java | 71 + .../flink/sink/dynamic/DynamicCommitter.java | 424 +++++ .../sink/dynamic/DynamicCommitterMetrics.java | 50 + .../sink/dynamic/DynamicIcebergSink.java | 427 +++++ .../flink/sink/dynamic/DynamicRecord.java | 130 ++ .../sink/dynamic/DynamicRecordGenerator.java | 34 + .../sink/dynamic/DynamicRecordInternal.java | 166 ++ .../DynamicRecordInternalSerializer.java | 295 ++++ .../dynamic/DynamicRecordInternalType.java | 96 ++ .../sink/dynamic/DynamicRecordProcessor.java | 184 +++ .../flink/sink/dynamic/DynamicSinkUtil.java | 65 + .../dynamic/DynamicTableUpdateOperator.java | 84 + .../sink/dynamic/DynamicWriteResult.java | 40 + .../dynamic/DynamicWriteResultAggregator.java | 188 +++ .../dynamic/DynamicWriteResultSerializer.java | 62 + .../flink/sink/dynamic/DynamicWriter.java 
| 213 +++ .../sink/dynamic/DynamicWriterMetrics.java | 49 + .../sink/dynamic/EvolveSchemaVisitor.java | 204 +++ .../flink/sink/dynamic/HashKeyGenerator.java | 382 +++++ .../iceberg/flink/sink/dynamic/LRUCache.java | 64 + .../sink/dynamic/PartitionSpecEvolution.java | 137 ++ .../sink/dynamic/TableMetadataCache.java | 272 ++++ .../sink/dynamic/TableSerializerCache.java | 133 ++ .../flink/sink/dynamic/TableUpdater.java | 214 +++ .../flink/sink/dynamic/WriteTarget.java | 144 ++ .../shuffle/AggregatedStatisticsTracker.java | 264 +++ .../sink/shuffle/CompletedStatistics.java | 128 ++ .../CompletedStatisticsSerializer.java | 188 +++ .../flink/sink/shuffle/DataStatistics.java | 48 + .../shuffle/DataStatisticsCoordinator.java | 536 ++++++ .../DataStatisticsCoordinatorProvider.java | 70 + .../sink/shuffle/DataStatisticsOperator.java | 269 +++ .../DataStatisticsOperatorFactory.java | 96 ++ .../shuffle/DataStatisticsSerializer.java | 204 +++ .../flink/sink/shuffle/GlobalStatistics.java | 114 ++ .../shuffle/GlobalStatisticsSerializer.java | 199 +++ .../flink/sink/shuffle/KeyAssignment.java | 155 ++ .../flink/sink/shuffle/MapAssignment.java | 242 +++ .../flink/sink/shuffle/MapDataStatistics.java | 88 + .../sink/shuffle/MapRangePartitioner.java | 95 ++ .../flink/sink/shuffle/RangePartitioner.java | 110 ++ .../shuffle/RequestGlobalStatisticsEvent.java | 40 + .../sink/shuffle/SketchDataStatistics.java | 87 + .../sink/shuffle/SketchRangePartitioner.java | 51 + .../flink/sink/shuffle/SketchUtil.java | 161 ++ .../flink/sink/shuffle/SortKeySerializer.java | 406 +++++ .../sink/shuffle/SortKeySketchSerializer.java | 143 ++ .../flink/sink/shuffle/SortKeyUtil.java | 58 + .../flink/sink/shuffle/StatisticsEvent.java | 76 + .../sink/shuffle/StatisticsOrRecord.java | 112 ++ .../shuffle/StatisticsOrRecordSerializer.java | 208 +++ .../StatisticsOrRecordTypeInformation.java | 109 ++ .../flink/sink/shuffle/StatisticsType.java | 55 + .../flink/sink/shuffle/StatisticsUtil.java | 143 ++ .../AvroGenericRecordFileScanTaskReader.java | 42 + .../iceberg/flink/source/DataIterator.java | 160 ++ .../iceberg/flink/source/DataTaskReader.java | 47 + .../flink/source/FileScanTaskReader.java | 35 + .../flink/source/FlinkInputFormat.java | 141 ++ .../iceberg/flink/source/FlinkInputSplit.java | 48 + .../iceberg/flink/source/FlinkSource.java | 318 ++++ .../flink/source/FlinkSplitPlanner.java | 189 +++ .../iceberg/flink/source/IcebergSource.java | 702 ++++++++ .../flink/source/IcebergTableSource.java | 239 +++ .../source/RowDataFileScanTaskReader.java | 243 +++ .../iceberg/flink/source/RowDataRewriter.java | 166 ++ .../RowDataToAvroGenericRecordConverter.java | 70 + .../iceberg/flink/source/ScanContext.java | 597 +++++++ .../iceberg/flink/source/SourceUtil.java | 77 + .../source/StreamingMonitorFunction.java | 269 +++ .../flink/source/StreamingReaderOperator.java | 247 +++ .../source/StreamingStartingStrategy.java | 61 + .../source/assigner/DefaultSplitAssigner.java | 119 ++ .../flink/source/assigner/GetSplitResult.java | 77 + .../assigner/OrderedSplitAssignerFactory.java | 46 + .../assigner/SimpleSplitAssignerFactory.java | 37 + .../flink/source/assigner/SplitAssigner.java | 124 ++ .../source/assigner/SplitAssignerFactory.java | 30 + .../source/assigner/SplitAssignerType.java | 33 + .../enumerator/AbstractIcebergEnumerator.java | 188 +++ .../ContinuousEnumerationResult.java | 57 + .../ContinuousIcebergEnumerator.java | 188 +++ .../enumerator/ContinuousSplitPlanner.java | 30 + .../ContinuousSplitPlannerImpl.java | 248 +++ 
.../source/enumerator/EnumerationHistory.java | 100 ++ .../enumerator/IcebergEnumeratorPosition.java | 79 + .../IcebergEnumeratorPositionSerializer.java | 90 ++ .../enumerator/IcebergEnumeratorState.java | 65 + .../IcebergEnumeratorStateSerializer.java | 194 +++ .../enumerator/StaticIcebergEnumerator.java | 51 + .../source/reader/ArrayBatchRecords.java | 171 ++ .../reader/ArrayPoolDataIteratorBatcher.java | 130 ++ .../reader/AvroGenericRecordConverter.java | 69 + .../AvroGenericRecordReaderFunction.java | 110 ++ .../reader/ColumnStatsWatermarkExtractor.java | 98 ++ .../reader/ConverterReaderFunction.java | 117 ++ .../source/reader/DataIteratorBatcher.java | 36 + .../reader/DataIteratorReaderFunction.java | 43 + .../source/reader/IcebergSourceReader.java | 77 + .../reader/IcebergSourceReaderMetrics.java | 61 + .../reader/IcebergSourceSplitReader.java | 167 ++ .../source/reader/LimitableDataIterator.java | 56 + .../flink/source/reader/ListBatchRecords.java | 85 + .../reader/ListDataIteratorBatcher.java | 94 ++ .../source/reader/MetaDataReaderFunction.java | 65 + .../flink/source/reader/ReaderFunction.java | 31 + .../source/reader/RecordAndPosition.java | 79 + .../flink/source/reader/RecordFactory.java | 34 + .../flink/source/reader/RecordLimiter.java | 45 + .../flink/source/reader/RowConverter.java | 64 + .../flink/source/reader/RowDataConverter.java | 32 + .../source/reader/RowDataReaderFunction.java | 115 ++ .../source/reader/RowDataRecordFactory.java | 74 + .../reader/SerializableRecordEmitter.java | 40 + .../reader/SplitWatermarkExtractor.java | 28 + .../WatermarkExtractorRecordEmitter.java | 67 + .../source/split/IcebergSourceSplit.java | 220 +++ .../split/IcebergSourceSplitSerializer.java | 64 + .../source/split/IcebergSourceSplitState.java | 37 + .../split/IcebergSourceSplitStatus.java | 25 + .../source/split/SerializableComparator.java | 24 + .../flink/source/split/SerializerHelper.java | 206 +++ .../flink/source/split/SplitComparators.java | 78 + .../flink/source/split/SplitRequestEvent.java | 54 + .../iceberg/flink/util/ElapsedTimeGauge.java | 47 + .../flink/util/FlinkAlterTableUtil.java | 248 +++ .../flink/util/FlinkCompatibilityUtil.java | 56 + .../iceberg/flink/util/FlinkPackage.java | 61 + .../org.apache.flink.table.factories.Factory | 17 + .../flink/AvroGenericRecordConverterBase.java | 90 ++ .../apache/iceberg/flink/CatalogTestBase.java | 121 ++ .../apache/iceberg/flink/DataGenerator.java | 42 + .../apache/iceberg/flink/DataGenerators.java | 1172 ++++++++++++++ .../iceberg/flink/HadoopCatalogExtension.java | 105 ++ .../iceberg/flink/HadoopTableExtension.java | 59 + .../flink/MiniFlinkClusterExtension.java | 67 + .../iceberg/flink/RowDataConverter.java | 145 ++ .../apache/iceberg/flink/SimpleDataUtil.java | 469 ++++++ .../org/apache/iceberg/flink/SqlBase.java | 110 ++ .../org/apache/iceberg/flink/TestBase.java | 145 ++ .../iceberg/flink/TestCatalogLoader.java | 116 ++ .../iceberg/flink/TestCatalogTableLoader.java | 113 ++ .../iceberg/flink/TestChangeLogTable.java | 296 ++++ .../flink/TestDataFileSerialization.java | 205 +++ .../apache/iceberg/flink/TestFixtures.java | 61 + .../flink/TestFlinkAnonymousTable.java | 65 + .../flink/TestFlinkCatalogDatabase.java | 253 +++ .../flink/TestFlinkCatalogFactory.java | 119 ++ .../iceberg/flink/TestFlinkCatalogTable.java | 722 +++++++++ .../TestFlinkCatalogTablePartitions.java | 119 ++ .../iceberg/flink/TestFlinkConfParser.java | 61 + .../iceberg/flink/TestFlinkFilters.java | 467 ++++++ .../iceberg/flink/TestFlinkHiveCatalog.java | 101 ++ 
.../iceberg/flink/TestFlinkSchemaUtil.java | 474 ++++++ .../iceberg/flink/TestFlinkTableSink.java | 266 +++ .../flink/TestFlinkTableSinkCompaction.java | 184 +++ .../flink/TestFlinkTableSinkExtended.java | 388 +++++ .../apache/iceberg/flink/TestFlinkUpsert.java | 336 ++++ .../org/apache/iceberg/flink/TestHelpers.java | 669 ++++++++ .../iceberg/flink/TestIcebergConnector.java | 331 ++++ .../flink/TestManifestFileSerialization.java | 173 ++ .../iceberg/flink/TestRowDataWrapper.java | 94 ++ .../apache/iceberg/flink/TestTableLoader.java | 57 + .../iceberg/flink/TestTableSerialization.java | 110 ++ .../actions/TestRewriteDataFilesAction.java | 523 ++++++ .../iceberg/flink/data/RandomRowData.java | 38 + .../flink/data/RowDataToRowMapper.java | 50 + .../flink/data/TestFlinkAvroReaderWriter.java | 136 ++ .../flink/data/TestFlinkOrcReaderWriter.java | 118 ++ .../flink/data/TestFlinkParquetReader.java | 268 +++ .../flink/data/TestFlinkParquetWriter.java | 121 ++ .../flink/data/TestRowDataProjection.java | 593 +++++++ .../iceberg/flink/data/TestRowProjection.java | 582 +++++++ .../iceberg/flink/data/TestStructRowData.java | 100 ++ .../api/MaintenanceTaskInfraExtension.java | 78 + .../api/MaintenanceTaskTestBase.java | 120 ++ .../api/TestDeleteOrphanFiles.java | 340 ++++ .../maintenance/api/TestExpireSnapshots.java | 233 +++ .../maintenance/api/TestJdbcLockFactory.java | 41 + .../maintenance/api/TestLockFactoryBase.java | 93 ++ .../maintenance/api/TestMaintenanceE2E.java | 80 + .../maintenance/api/TestRewriteDataFiles.java | 457 ++++++ .../api/TestRewriteDataFilesConfig.java | 142 ++ .../maintenance/api/TestTableMaintenance.java | 462 ++++++ .../maintenance/api/TestZkLockFactory.java | 54 + .../maintenance/operator/CollectingSink.java | 116 ++ .../maintenance/operator/ManualSource.java | 320 ++++ .../MetricsReporterFactoryForTests.java | 192 +++ .../operator/OperatorTestBase.java | 397 +++++ .../maintenance/operator/RewriteUtil.java | 85 + .../TestDataFileRewriteCommitter.java | 278 ++++ .../operator/TestDataFileRewritePlanner.java | 210 +++ .../operator/TestDataFileRewriteRunner.java | 357 ++++ .../operator/TestDeleteFilesProcessor.java | 115 ++ .../TestExpireSnapshotsProcessor.java | 122 ++ .../operator/TestListFileSystemFiles.java | 110 ++ .../operator/TestListMetadataFiles.java | 90 ++ .../maintenance/operator/TestLockConfig.java | 84 + .../operator/TestLockFactoryBuilder.java | 109 ++ .../maintenance/operator/TestLockRemover.java | 444 +++++ .../operator/TestMonitorSource.java | 355 ++++ .../operator/TestOrphanFilesDetector.java | 246 +++ .../maintenance/operator/TestSkipOnError.java | 100 ++ .../operator/TestTablePlanerAndReader.java | 120 ++ .../operator/TestTaskResultAggregator.java | 78 + .../operator/TestTriggerManager.java | 671 ++++++++ .../iceberg/flink/sink/SinkTestUtil.java | 62 + .../TestAvroGenericRecordToRowDataMapper.java | 38 + .../sink/TestBucketPartitionKeySelector.java | 67 + .../flink/sink/TestBucketPartitioner.java | 108 ++ ...TestBucketPartitionerFlinkIcebergSink.java | 233 +++ .../flink/sink/TestBucketPartitionerUtil.java | 126 ++ .../flink/sink/TestCachingTableSupplier.java | 81 + ...TestCommittableToTableChangeConverter.java | 319 ++++ .../flink/sink/TestCompressionSettings.java | 258 +++ .../flink/sink/TestDeltaTaskWriter.java | 428 +++++ .../flink/sink/TestFlinkAppenderFactory.java | 65 + .../sink/TestFlinkFileWriterFactory.java | 66 + .../flink/sink/TestFlinkIcebergSink.java | 143 ++ .../flink/sink/TestFlinkIcebergSinkBase.java | 133 ++ 
.../sink/TestFlinkIcebergSinkBranch.java | 158 ++ .../TestFlinkIcebergSinkDistributionMode.java | 602 +++++++ .../sink/TestFlinkIcebergSinkExtended.java | 250 +++ ...IcebergSinkRangeDistributionBucketing.java | 255 +++ .../flink/sink/TestFlinkIcebergSinkV2.java | 254 +++ .../sink/TestFlinkIcebergSinkV2Base.java | 424 +++++ .../sink/TestFlinkIcebergSinkV2Branch.java | 137 ++ ...estFlinkIcebergSinkV2DistributionMode.java | 618 +++++++ .../iceberg/flink/sink/TestFlinkManifest.java | 313 ++++ .../sink/TestFlinkPartitioningWriters.java | 77 + .../sink/TestFlinkPositionDeltaWriters.java | 66 + .../sink/TestFlinkRollingFileWriters.java | 51 + .../flink/sink/TestFlinkWriterMetrics.java | 60 + .../flink/sink/TestIcebergCommitter.java | 1435 +++++++++++++++++ .../flink/sink/TestIcebergFilesCommitter.java | 1238 ++++++++++++++ .../iceberg/flink/sink/TestIcebergSink.java | 563 +++++++ .../flink/sink/TestIcebergSinkBranch.java | 142 ++ .../flink/sink/TestIcebergSinkCompact.java | 149 ++ .../iceberg/flink/sink/TestIcebergSinkV2.java | 287 ++++ .../flink/sink/TestIcebergSinkV2Branch.java | 110 ++ .../flink/sink/TestIcebergStreamWriter.java | 409 +++++ .../flink/sink/TestRowDataPartitionKey.java | 251 +++ .../iceberg/flink/sink/TestTaskWriters.java | 241 +++ ...namicRecordInternalSerializerTestBase.java | 96 ++ .../dynamic/TestCompareSchemasVisitor.java | 229 +++ .../TestDynamicCommittableSerializer.java | 62 + .../sink/dynamic/TestDynamicCommitter.java | 381 +++++ .../sink/dynamic/TestDynamicIcebergSink.java | 850 ++++++++++ .../dynamic/TestDynamicIcebergSinkPerf.java | 245 +++ ...icRecordInternalSerializerWriteSchema.java | 28 + ...RecordInternalSerializerWriteSchemaId.java | 28 + .../TestDynamicTableUpdateOperator.java | 120 ++ .../TestDynamicWriteResultAggregator.java | 82 + .../TestDynamicWriteResultSerializer.java | 82 + .../flink/sink/dynamic/TestDynamicWriter.java | 254 +++ .../sink/dynamic/TestEvolveSchemaVisitor.java | 626 +++++++ .../sink/dynamic/TestHashKeyGenerator.java | 350 ++++ .../flink/sink/dynamic/TestLRUCache.java | 88 + .../dynamic/TestPartitionSpecEvolution.java | 188 +++ .../sink/dynamic/TestRowDataConverter.java | 262 +++ .../sink/dynamic/TestTableMetadataCache.java | 94 ++ .../dynamic/TestTableSerializerCache.java | 124 ++ .../flink/sink/dynamic/TestTableUpdater.java | 160 ++ .../sink/shuffle/DataDistributionUtil.java | 178 ++ .../iceberg/flink/sink/shuffle/Fixtures.java | 100 ++ .../TestAggregatedStatisticsTracker.java | 465 ++++++ .../TestCompletedStatisticsSerializer.java | 103 ++ .../shuffle/TestDataDistributionUtil.java | 49 + .../TestDataStatisticsCoordinator.java | 373 +++++ ...TestDataStatisticsCoordinatorProvider.java | 187 +++ .../shuffle/TestDataStatisticsOperator.java | 374 +++++ .../shuffle/TestDataStatisticsSerializer.java | 53 + .../TestGlobalStatisticsSerializer.java | 59 + .../sink/shuffle/TestMapDataStatistics.java | 67 + .../sink/shuffle/TestMapRangePartitioner.java | 436 +++++ .../sink/shuffle/TestRangePartitioner.java | 65 + .../shuffle/TestRangePartitionerSkew.java | 183 +++ .../shuffle/TestSketchDataStatistics.java | 60 + .../shuffle/TestSketchRangePartitioner.java | 88 + .../flink/sink/shuffle/TestSketchUtil.java | 200 +++ .../shuffle/TestSortKeySerializerBase.java | 65 + .../TestSortKeySerializerNestedStruct.java | 55 + .../TestSortKeySerializerPrimitives.java | 90 ++ .../TestSortKeySerializerSnapshot.java | 235 +++ .../flink/sink/shuffle/TestSortKeyUtil.java | 73 + ...TestStatisticsOrRecordTypeInformation.java | 46 + 
.../flink/source/BoundedTableFactory.java | 184 +++ .../flink/source/BoundedTestSource.java | 108 ++ .../flink/source/ChangeLogTableTestBase.java | 95 ++ .../iceberg/flink/source/SplitHelpers.java | 200 +++ .../iceberg/flink/source/SqlHelpers.java | 60 + .../flink/source/TableSourceTestBase.java | 105 ++ .../flink/source/TestBoundedTableFactory.java | 81 + .../flink/source/TestFlinkInputFormat.java | 211 +++ .../TestFlinkInputFormatReaderDeletes.java | 71 + .../flink/source/TestFlinkMergingMetrics.java | 67 + .../flink/source/TestFlinkMetaDataTable.java | 813 ++++++++++ .../source/TestFlinkReaderDeletesBase.java | 90 ++ .../iceberg/flink/source/TestFlinkScan.java | 540 +++++++ .../flink/source/TestFlinkScanSql.java | 69 + .../iceberg/flink/source/TestFlinkSource.java | 90 ++ .../flink/source/TestFlinkSourceConfig.java | 61 + .../flink/source/TestFlinkSourceSql.java | 87 + .../flink/source/TestFlinkTableSource.java | 561 +++++++ .../source/TestIcebergSourceBounded.java | 151 ++ ...TestIcebergSourceBoundedConverterBase.java | 223 +++ ...TestIcebergSourceBoundedGenericRecord.java | 96 ++ .../source/TestIcebergSourceBoundedRow.java | 58 + .../source/TestIcebergSourceBoundedSql.java | 76 + .../source/TestIcebergSourceContinuous.java | 573 +++++++ .../source/TestIcebergSourceFailover.java | 397 +++++ ...gSourceFailoverWithWatermarkExtractor.java | 130 ++ .../TestIcebergSourceInferParallelism.java | 181 +++ .../TestIcebergSourceReaderDeletes.java | 102 ++ .../flink/source/TestIcebergSourceSql.java | 233 +++ ...stIcebergSourceWithWatermarkExtractor.java | 408 +++++ ...estIcebergSpeculativeExecutionSupport.java | 206 +++ .../TestMetadataTableReadableMetrics.java | 364 +++++ .../flink/source/TestProjectMetaColumn.java | 189 +++ ...stRowDataToAvroGenericRecordConverter.java | 36 + .../iceberg/flink/source/TestScanContext.java | 112 ++ .../iceberg/flink/source/TestSourceUtil.java | 61 + .../iceberg/flink/source/TestSqlBase.java | 173 ++ .../flink/source/TestStreamScanSql.java | 490 ++++++ .../source/TestStreamingMonitorFunction.java | 399 +++++ .../source/TestStreamingReaderOperator.java | 288 ++++ .../assigner/SplitAssignerTestBase.java | 132 ++ .../assigner/TestDefaultSplitAssigner.java | 43 + ...tFileSequenceNumberBasedSplitAssigner.java | 81 + .../TestWatermarkBasedSplitAssigner.java | 146 ++ .../ManualContinuousSplitPlanner.java | 97 ++ .../TestContinuousIcebergEnumerator.java | 352 ++++ .../TestContinuousSplitPlannerImpl.java | 734 +++++++++ ...ntinuousSplitPlannerImplStartStrategy.java | 219 +++ .../enumerator/TestEnumerationHistory.java | 135 ++ .../TestIcebergEnumeratorStateSerializer.java | 146 ++ .../source/reader/ReaderFunctionTestBase.java | 218 +++ .../flink/source/reader/ReaderUtil.java | 128 ++ .../source/reader/TestArrayBatchRecords.java | 69 + ...stArrayPoolDataIteratorBatcherRowData.java | 360 +++++ .../TestColumnStatsWatermarkExtractor.java | 176 ++ .../reader/TestIcebergSourceReader.java | 181 +++ .../reader/TestLimitableDataIterator.java | 84 + .../reader/TestRowDataReaderFunction.java | 69 + .../source/reader/TestingMetricGroup.java | 102 ++ .../TestIcebergSourceSplitSerializer.java | 183 +++ .../iceberg/flink/util/TestFlinkPackage.java | 55 + ...ink.metrics.reporter.MetricReporterFactory | 16 + .../org.apache.flink.table.factories.Factory | 16 + 473 files changed, 88428 insertions(+) create mode 100644 flink/v2.0/build.gradle create mode 100644 flink/v2.0/flink-runtime/LICENSE create mode 100644 flink/v2.0/flink-runtime/NOTICE create mode 100644 
flink/v2.0/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java create mode 100644 flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java create mode 100644 flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java create mode 100644 flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/flink/table/api/runtime/types/FlinkScalaKryoInstantiator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java create mode 100644 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java create 
mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java create mode 100644 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkBuilder.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java create mode 100644 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java create mode 100644 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java create mode 100644 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java create mode 100644 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java create mode 100644 flink/v2.0/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java create mode 100644 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java create mode 100644 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java create mode 100644 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java create mode 100644 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java create mode 100644 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java create mode 100644 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java create mode 100644 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java create mode 100644 
flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java create mode 100644 flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory create mode 100644 flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory diff --git a/flink/v2.0/build.gradle b/flink/v2.0/build.gradle new file mode 100644 index 000000000000..dfbaa8ff4184 --- /dev/null +++ b/flink/v2.0/build.gradle @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +String flinkMajorVersion = '2.0' +String scalaVersion = System.getProperty("scalaVersion") != null ? System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion") + +project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { + + dependencies { + implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow') + api project(':iceberg-api') + implementation project(':iceberg-common') + implementation project(':iceberg-core') + api project(':iceberg-data') + implementation project(':iceberg-orc') + implementation project(':iceberg-parquet') + implementation project(':iceberg-hive-metastore') + + compileOnly libs.flink20.avro + // for dropwizard histogram metrics implementation + compileOnly libs.flink20.metrics.dropwizard + compileOnly libs.flink20.streaming.java + compileOnly "${libs.flink20.streaming.java.get().module}:${libs.flink20.streaming.java.get().getVersion()}:tests" + compileOnly libs.flink20.table.api.java.bridge + compileOnly "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink20.get()}" + compileOnly libs.flink20.connector.base + compileOnly libs.flink20.connector.files + + compileOnly libs.hadoop3.hdfs + compileOnly libs.hadoop3.common + compileOnly(libs.hadoop3.minicluster) { + exclude group: 'org.apache.avro', module: 'avro' + } + + implementation(libs.parquet.avro) { + exclude group: 'org.apache.avro', module: 'avro' + // already shaded by Parquet + exclude group: 'it.unimi.dsi' + exclude group: 'org.codehaus.jackson' + } + + compileOnly libs.avro.avro + + implementation("${libs.orc.core.get().module}:${libs.versions.orc.get()}:nohive") { + exclude group: 'org.apache.hadoop' + exclude group: 'commons-lang' + // These artifacts are shaded and included in the orc-core fat jar + exclude group: 'com.google.protobuf', module: 'protobuf-java' + exclude group: 'org.apache.hive', module: 'hive-storage-api' + exclude group: 'org.slf4j' + } + + implementation libs.datasketches + + // for caching in DynamicSink + implementation libs.caffeine + + testImplementation libs.flink20.connector.test.utils + testImplementation libs.flink20.core + testImplementation libs.flink20.runtime + 
testImplementation(libs.flink20.test.utilsjunit) { + exclude group: 'junit' + } + testImplementation(libs.flink20.test.utils) { + exclude group: "org.apache.curator", module: 'curator-test' + exclude group: 'junit' + } + + testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + + // By default, hive-exec is a fat/uber jar and it exports a guava library + // that's really old. We use the core classifier to be able to override our guava + // version. Luckily, hive-exec seems to work okay so far with this version of guava + // See: https://github.com/apache/hive/blob/master/ql/pom.xml#L911 for more context. + testImplementation("${libs.hive2.exec.get().module}:${libs.hive2.exec.get().getVersion()}:core") { + exclude group: 'org.apache.avro', module: 'avro' + exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'org.pentaho' // missing dependency + exclude group: 'org.apache.hive', module: 'hive-llap-tez' + exclude group: 'org.apache.logging.log4j' + exclude group: 'com.google.protobuf', module: 'protobuf-java' + exclude group: 'org.apache.calcite' + exclude group: 'org.apache.calcite.avatica' + exclude group: 'com.google.code.findbugs', module: 'jsr305' + } + + testImplementation(libs.hive2.metastore) { + exclude group: 'org.apache.avro', module: 'avro' + exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'org.pentaho' // missing dependency + exclude group: 'org.apache.hbase' + exclude group: 'org.apache.logging.log4j' + exclude group: 'co.cask.tephra' + exclude group: 'com.google.code.findbugs', module: 'jsr305' + exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all' + exclude group: 'org.eclipse.jetty.orbit', module: 'javax.servlet' + exclude group: 'org.apache.parquet', module: 'parquet-hadoop-bundle' + exclude group: 'com.tdunning', module: 'json' + exclude group: 'javax.transaction', module: 'transaction-api' + exclude group: 'com.zaxxer', module: 'HikariCP' + exclude group: 'org.slf4j' + } + + testImplementation libs.awaitility + testImplementation libs.assertj.core + testImplementation libs.sqlite.jdbc + } + + test { + useJUnitPlatform() + } +} + +project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { + apply plugin: 'com.gradleup.shadow' + + tasks.jar.dependsOn tasks.shadowJar + + sourceSets { + integration { + java.srcDir "$projectDir/src/integration/java" + resources.srcDir "$projectDir/src/integration/resources" + } + } + + configurations { + implementation { + // included in Flink + exclude group: 'org.slf4j' + exclude group: 'org.apache.commons' + exclude group: 'commons-pool' + exclude group: 'commons-codec' + exclude group: 'org.xerial.snappy' + exclude group: 'javax.xml.bind' + exclude group: 'javax.annotation' + } + } + + dependencies { + implementation(project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}")) { + exclude group: 'org.apache.flink' + } + implementation project(':iceberg-aws') + implementation project(':iceberg-azure') + implementation(project(':iceberg-aliyun')) { + exclude group: 'edu.umd.cs.findbugs', module: 'findbugs' + exclude group: 'org.apache.httpcomponents', module: 'httpclient' + exclude group: 'commons-logging', module: 'commons-logging' + } + implementation project(':iceberg-gcp') + 
implementation(project(':iceberg-nessie')) { + exclude group: 'com.google.code.findbugs', module: 'jsr305' + } + + // for dropwizard histogram metrics implementation + implementation libs.flink20.metrics.dropwizard + + // for integration testing with the flink-runtime-jar + // all of those dependencies are required because the integration test extends FlinkTestBase + integrationCompileOnly project(':iceberg-api') + integrationImplementation libs.assertj.core + integrationImplementation project(path: ":iceberg-flink:iceberg-flink-${flinkMajorVersion}", configuration: "testArtifacts") + integrationImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') + integrationImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') + integrationImplementation(libs.flink20.test.utils) { + exclude group: "org.apache.curator", module: 'curator-test' + exclude group: 'junit' + } + + integrationImplementation libs.flink20.table.api.java.bridge + integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink20.get()}" + + integrationImplementation libs.hadoop3.common + integrationImplementation libs.hadoop3.hdfs + integrationImplementation(libs.hadoop3.minicluster) { + exclude group: 'org.apache.avro', module: 'avro' + } + + integrationImplementation(libs.hive2.metastore) { + exclude group: 'org.apache.avro', module: 'avro' + exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'org.pentaho' // missing dependency + exclude group: 'org.apache.hbase' + exclude group: 'org.apache.logging.log4j' + exclude group: 'co.cask.tephra' + exclude group: 'com.google.code.findbugs', module: 'jsr305' + exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all' + exclude group: 'org.eclipse.jetty.orbit', module: 'javax.servlet' + exclude group: 'org.apache.parquet', module: 'parquet-hadoop-bundle' + exclude group: 'com.tdunning', module: 'json' + exclude group: 'javax.transaction', module: 'transaction-api' + exclude group: 'com.zaxxer', module: 'HikariCP' + exclude group: 'org.slf4j' + } + + integrationImplementation("${libs.hive2.exec.get().module}:${libs.hive2.exec.get().getVersion()}:core") { + exclude group: 'org.apache.avro', module: 'avro' + exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'org.pentaho' // missing dependency + exclude group: 'org.apache.hive', module: 'hive-llap-tez' + exclude group: 'org.apache.logging.log4j' + exclude group: 'com.google.protobuf', module: 'protobuf-java' + exclude group: 'org.apache.calcite' + exclude group: 'org.apache.calcite.avatica' + exclude group: 'com.google.code.findbugs', module: 'jsr305' + } + } + + shadowJar { + configurations = [project.configurations.runtimeClasspath] + + zip64 true + + // include the LICENSE and NOTICE files for the shaded Jar + from(projectDir) { + include 'LICENSE' + include 'NOTICE' + } + + // Relocate dependencies to avoid conflicts + relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro' + relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet' + relocate 'com.google.errorprone', 'org.apache.iceberg.shaded.com.google.errorprone' + relocate 'com.google.flatbuffers', 'org.apache.iceberg.shaded.com.google.flatbuffers' + relocate 'com.fasterxml', 'org.apache.iceberg.shaded.com.fasterxml' + relocate 'com.github.benmanes', 'org.apache.iceberg.shaded.com.github.benmanes' + relocate 'org.checkerframework', 'org.apache.iceberg.shaded.org.checkerframework' + relocate 'shaded.parquet', 
'org.apache.iceberg.shaded.org.apache.parquet.shaded' + relocate 'org.apache.orc', 'org.apache.iceberg.shaded.org.apache.orc' + relocate 'io.airlift', 'org.apache.iceberg.shaded.io.airlift' + relocate 'org.threeten.extra', 'org.apache.iceberg.shaded.org.threeten.extra' + relocate 'org.apache.hc.client5', 'org.apache.iceberg.shaded.org.apache.hc.client5' + relocate 'org.apache.hc.core5', 'org.apache.iceberg.shaded.org.apache.hc.core5' + + archiveClassifier.set(null) + } + + task integrationTest(type: Test) { + description = "Test Flink Runtime Jar against Flink ${flinkMajorVersion}" + group = "verification" + jvmArgs += project.property('extraJvmArgs') + testClassesDirs = sourceSets.integration.output.classesDirs + classpath = sourceSets.integration.runtimeClasspath + files(shadowJar.archiveFile.get().asFile.path) + inputs.file(shadowJar.archiveFile.get().asFile.path) + } + integrationTest.dependsOn shadowJar + check.dependsOn integrationTest + + jar { + enabled = false + } +} diff --git a/flink/v2.0/flink-runtime/LICENSE b/flink/v2.0/flink-runtime/LICENSE new file mode 100644 index 000000000000..9ca869edb59b --- /dev/null +++ b/flink/v2.0/flink-runtime/LICENSE @@ -0,0 +1,520 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache Avro. + +Copyright: 2014-2020 The Apache Software Foundation. +Home page: https://avro.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains the Jackson JSON processor. + +Copyright: 2007-2020 Tatu Saloranta and other contributors +Home page: http://jackson.codehaus.org/ +License: http://www.apache.org/licenses/LICENSE-2.0.txt + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache Parquet. + +Copyright: 2014-2020 The Apache Software Foundation. +Home page: https://parquet.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache Thrift. 
+ +Copyright: 2006-2010 The Apache Software Foundation. +Home page: https://thrift.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains fastutil. + +Copyright: 2002-2014 Sebastiano Vigna +Home page: http://fastutil.di.unimi.it/ +License: http://www.apache.org/licenses/LICENSE-2.0.html + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache ORC. + +Copyright: 2013-2020 The Apache Software Foundation. +Home page: https://orc.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache Hive's storage API via ORC. + +Copyright: 2013-2020 The Apache Software Foundation. +Home page: https://hive.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Airlift Aircompressor. + +Copyright: 2011-2020 Aircompressor authors. +Home page: https://github.com/airlift/aircompressor +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains JetBrains annotations. + +Copyright: 2000-2020 JetBrains s.r.o. +Home page: https://github.com/JetBrains/java-annotations +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Google Guava. + +Copyright: 2006-2020 The Guava Authors +Home page: https://github.com/google/guava +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Google Error Prone Annotations. + +Copyright: Copyright 2011-2019 The Error Prone Authors +Home page: https://github.com/google/error-prone +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains checkerframework checker-qual Annotations. + +Copyright: 2004-2020 the Checker Framework developers +Home page: https://github.com/typetools/checker-framework +License: https://github.com/typetools/checker-framework/blob/master/LICENSE.txt (MIT license) + +License text: +| The annotations are licensed under the MIT License. (The text of this +| license appears below.) More specifically, all the parts of the Checker +| Framework that you might want to include with your own program use the +| MIT License. This is the checker-qual.jar file and all the files that +| appear in it: every file in a qual/ directory, plus utility files such +| as NullnessUtil.java, RegexUtil.java, SignednessUtil.java, etc. +| In addition, the cleanroom implementations of third-party annotations, +| which the Checker Framework recognizes as aliases for its own +| annotations, are licensed under the MIT License. 
+| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in +| all copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +| THE SOFTWARE. + +-------------------------------------------------------------------------------- + +This binary artifact contains Caffeine by Ben Manes. + +Copyright: 2014-2020 Ben Manes and contributors +Home page: https://github.com/ben-manes/caffeine +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Google protobuf. + +Copyright: 2008 Google Inc. +Home page: https://developers.google.com/protocol-buffers +License: https://github.com/protocolbuffers/protobuf/blob/master/LICENSE (BSD) + +License text: + +| Copyright 2008 Google Inc. All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are +| met: +| +| * Redistributions of source code must retain the above copyright +| notice, this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above +| copyright notice, this list of conditions and the following disclaimer +| in the documentation and/or other materials provided with the +| distribution. +| * Neither the name of Google Inc. nor the names of its +| contributors may be used to endorse or promote products derived from +| this software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +| +| Code generated by the Protocol Buffer compiler is owned by the owner +| of the input file used when generating it. This code is not +| standalone and requires a support library to be linked with it. This +| support library is itself covered by the above license. 
+ +-------------------------------------------------------------------------------- + +This binary artifact contains ThreeTen. + +Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. +Home page: https://www.threeten.org/threeten-extra/ +License: https://github.com/ThreeTen/threeten-extra/blob/master/LICENSE.txt (BSD 3-clause) + +License text: + +| All rights reserved. +| +| * Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| * Neither the name of JSR-310 nor the names of its contributors +| may be used to endorse or promote products derived from this software +| without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This binary artifact contains Project Nessie. + +Copyright: Copyright 2015-2025 Dremio Corporation +Home page: https://projectnessie.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary includes code from Apache Commons. + +* Core ArrayUtil. + +Copyright: 2020 The Apache Software Foundation +Home page: https://commons.apache.org/ +License: https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache HttpComponents Client. + +Copyright: 1999-2022 The Apache Software Foundation. +Home page: https://hc.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product includes code from Apache HttpComponents Client. + +* retry and error handling logic in ExponentialHttpRequestRetryStrategy.java + +Copyright: 1999-2022 The Apache Software Foundation. +Home page: https://hc.apache.org/ +License: https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains failsafe. + +Copyright: Jonathan Halterman and friends +Home page: https://failsafe.dev/ +License: https://www.apache.org/licenses/LICENSE-2.0.html + +-------------------------------------------------------------------------------- + +This binary artifact contains Codehale Metrics. 
+ +Copyright: (c) 2010-2013 Coda Hale, Yammer.com, 2014-2021 Dropwizard Team +Home page: https://github.com/dropwizard/metrics +License: https://www.apache.org/licenses/LICENSE-2.0.html + +-------------------------------------------------------------------------------- + +This binary artifact contains RoaringBitmap. + +Copyright: (c) 2013-... the RoaringBitmap authors +Home page: https://github.com/RoaringBitmap/RoaringBitmap +License: https://www.apache.org/licenses/LICENSE-2.0.html + +-------------------------------------------------------------------------------- + +This binary artifact contains Eclipse Microprofile OpenAPI. + +Copyright: Copyright (c) 2017 Contributors to the Eclipse Foundation +Home page: https://github.com/microprofile/microprofile-open-api +License: https://www.apache.org/licenses/LICENSE-2.0.html + +-------------------------------------------------------------------------------- + +This binary artifact contains Luben Zstd. + +Copyright: Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. +Home page: https://github.com/luben/zstd-jni/ +License: BSD-2 License +License text: + +| Zstd-jni: JNI bindings to Zstd Library +| +| Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. +| +| BSD License +| +| Redistribution and use in source and binary forms, with or without modification, +| are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| * Redistributions in binary form must reproduce the above copyright notice, this +| list of conditions and the following disclaimer in the documentation and/or +| other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +| ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +| (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +| LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +| ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/flink/v2.0/flink-runtime/NOTICE b/flink/v2.0/flink-runtime/NOTICE new file mode 100644 index 000000000000..0838a76b3473 --- /dev/null +++ b/flink/v2.0/flink-runtime/NOTICE @@ -0,0 +1,360 @@ + +Apache Iceberg +Copyright 2017-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +-------------------------------------------------------------------------------- + +This binary artifact includes Airlift Aircompressor with the following in its +NOTICE file: + +| Snappy Copyright Notices +| ========================= +| +| * Copyright 2011 Dain Sundstrom +| * Copyright 2011, Google Inc. +| +| +| Snappy License +| =============== +| Copyright 2011, Google Inc. +| All rights reserved. 
+| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are +| met: +| +| * Redistributions of source code must retain the above copyright +| notice, this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above +| copyright notice, this list of conditions and the following disclaimer +| in the documentation and/or other materials provided with the +| distribution. +| * Neither the name of Google Inc. nor the names of its +| contributors may be used to endorse or promote products derived from +| this software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This binary artifact includes Project Nessie with the following in its NOTICE +file: + +| Nessie +| Copyright 2015-2025 Dremio Corporation +| +| --------------------------------------- +| This project includes code from Apache Polaris (incubating), with the following in its NOTICE file: +| +| | Apache Polaris (incubating) +| | Copyright 2024 The Apache Software Foundation +| | +| | This product includes software developed at +| | The Apache Software Foundation (http://www.apache.org/). +| | +| | The initial code for the Polaris project was donated +| | to the ASF by Snowflake Inc. (https://www.snowflake.com/) copyright 2024. +| +| --------------------------------------- +| This project includes code from Netty, with the following in its NOTICE file: +| +| | The Netty Project +| | ================= +| | +| | Please visit the Netty web site for more information: +| | +| | * https://netty.io/ +| | +| | Copyright 2014 The Netty Project +| | +| | The Netty Project licenses this file to you under the Apache License, +| | version 2.0 (the "License"); you may not use this file except in compliance +| | with the License. You may obtain a copy of the License at: +| | +| | https://www.apache.org/licenses/LICENSE-2.0 +| | +| | Unless required by applicable law or agreed to in writing, software +| | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +| | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +| | License for the specific language governing permissions and limitations +| | under the License. +| | +| | Also, please refer to each LICENSE.<component>.txt file, which is located in +| | the 'license' directory of the distribution file, for the license terms of the +| | components that this product depends on. 
+| | +| | ------------------------------------------------------------------------------- +| | This product contains the extensions to Java Collections Framework which has +| | been derived from the works by JSR-166 EG, Doug Lea, and Jason T. Greene: +| | +| | * LICENSE: +| | * license/LICENSE.jsr166y.txt (Public Domain) +| | * HOMEPAGE: +| | * http://gee.cs.oswego.edu/cgi-bin/viewcvs.cgi/jsr166/ +| | * http://viewvc.jboss.org/cgi-bin/viewvc.cgi/jbosscache/experimental/jsr166/ +| | +| | This product contains a modified version of Robert Harder's Public Domain +| | Base64 Encoder and Decoder, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.base64.txt (Public Domain) +| | * HOMEPAGE: +| | * http://iharder.sourceforge.net/current/java/base64/ +| | +| | This product contains a modified portion of 'Webbit', an event based +| | WebSocket and HTTP server, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.webbit.txt (BSD License) +| | * HOMEPAGE: +| | * https://github.com/joewalnes/webbit +| | +| | This product contains a modified portion of 'SLF4J', a simple logging +| | facade for Java, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.slf4j.txt (MIT License) +| | * HOMEPAGE: +| | * https://www.slf4j.org/ +| | +| | This product contains a modified portion of 'Apache Harmony', an open source +| | Java SE, which can be obtained at: +| | +| | * NOTICE: +| | * license/NOTICE.harmony.txt +| | * LICENSE: +| | * license/LICENSE.harmony.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://archive.apache.org/dist/harmony/ +| | +| | This product contains a modified portion of 'jbzip2', a Java bzip2 compression +| | and decompression library written by Matthew J. Francis. It can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.jbzip2.txt (MIT License) +| | * HOMEPAGE: +| | * https://code.google.com/p/jbzip2/ +| | +| | This product contains a modified portion of 'libdivsufsort', a C API library to construct +| | the suffix array and the Burrows-Wheeler transformed string for any input string of +| | a constant-size alphabet written by Yuta Mori. It can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.libdivsufsort.txt (MIT License) +| | * HOMEPAGE: +| | * https://github.com/y-256/libdivsufsort +| | +| | This product contains a modified portion of Nitsan Wakart's 'JCTools', Java Concurrency Tools for the JVM, +| | which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.jctools.txt (ASL2 License) +| | * HOMEPAGE: +| | * https://github.com/JCTools/JCTools +| | +| | This product optionally depends on 'JZlib', a re-implementation of zlib in +| | pure Java, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.jzlib.txt (BSD style License) +| | * HOMEPAGE: +| | * http://www.jcraft.com/jzlib/ +| | +| | This product optionally depends on 'Compress-LZF', a Java library for encoding and +| | decoding data in LZF format, written by Tatu Saloranta. It can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.compress-lzf.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://github.com/ning/compress +| | +| | This product optionally depends on 'lz4', a LZ4 Java compression +| | and decompression library written by Adrien Grand. 
It can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.lz4.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://github.com/jpountz/lz4-java +| | +| | This product optionally depends on 'lzma-java', a LZMA Java compression +| | and decompression library, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.lzma-java.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://github.com/jponge/lzma-java +| | +| | This product optionally depends on 'zstd-jni', a zstd-jni Java compression +| | and decompression library, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.zstd-jni.txt (BSD) +| | * HOMEPAGE: +| | * https://github.com/luben/zstd-jni +| | +| | This product contains a modified portion of 'jfastlz', a Java port of FastLZ compression +| | and decompression library written by William Kinney. It can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.jfastlz.txt (MIT License) +| | * HOMEPAGE: +| | * https://code.google.com/p/jfastlz/ +| | +| | This product contains a modified portion of and optionally depends on 'Protocol Buffers', Google's data +| | interchange format, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.protobuf.txt (New BSD License) +| | * HOMEPAGE: +| | * https://github.com/google/protobuf +| | +| | This product optionally depends on 'Bouncy Castle Crypto APIs' to generate +| | a temporary self-signed X.509 certificate when the JVM does not provide the +| | equivalent functionality. It can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.bouncycastle.txt (MIT License) +| | * HOMEPAGE: +| | * https://www.bouncycastle.org/ +| | +| | This product optionally depends on 'Snappy', a compression library produced +| | by Google Inc, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.snappy.txt (New BSD License) +| | * HOMEPAGE: +| | * https://github.com/google/snappy +| | +| | This product optionally depends on 'JBoss Marshalling', an alternative Java +| | serialization API, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.jboss-marshalling.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://github.com/jboss-remoting/jboss-marshalling +| | +| | This product optionally depends on 'Caliper', Google's micro- +| | benchmarking framework, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.caliper.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://github.com/google/caliper +| | +| | This product optionally depends on 'Apache Commons Logging', a logging +| | framework, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.commons-logging.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://commons.apache.org/logging/ +| | +| | This product optionally depends on 'Apache Log4J', a logging framework, which +| | can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.log4j.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://logging.apache.org/log4j/ +| | +| | This product optionally depends on 'Aalto XML', an ultra-high performance +| | non-blocking XML processor, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.aalto-xml.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://wiki.fasterxml.com/AaltoHome +| | +| | This product contains a modified version of 'HPACK', a Java implementation of +| | the HTTP/2 HPACK algorithm written by Twitter. 
It can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.hpack.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://github.com/twitter/hpack +| | +| | This product contains a modified version of 'HPACK', a Java implementation of +| | the HTTP/2 HPACK algorithm written by Cory Benfield. It can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.hyper-hpack.txt (MIT License) +| | * HOMEPAGE: +| | * https://github.com/python-hyper/hpack/ +| | +| | This product contains a modified version of 'HPACK', a Java implementation of +| | the HTTP/2 HPACK algorithm written by Tatsuhiro Tsujikawa. It can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.nghttp2-hpack.txt (MIT License) +| | * HOMEPAGE: +| | * https://github.com/nghttp2/nghttp2/ +| | +| | This product contains a modified portion of 'Apache Commons Lang', a Java library +| | provides utilities for the java.lang API, which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.commons-lang.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://commons.apache.org/proper/commons-lang/ +| | +| | +| | This product contains the Maven wrapper scripts from 'Maven Wrapper', that provides an easy way to ensure a user has everything necessary to run the Maven build. +| | +| | * LICENSE: +| | * license/LICENSE.mvn-wrapper.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://github.com/takari/maven-wrapper +| | +| | This product contains the dnsinfo.h header file, that provides a way to retrieve the system DNS configuration on MacOS. +| | This private header is also used by Apple's open source +| | mDNSResponder (https://opensource.apple.com/tarballs/mDNSResponder/). +| | +| | * LICENSE: +| | * license/LICENSE.dnsinfo.txt (Apple Public Source License 2.0) +| | * HOMEPAGE: +| | * https://www.opensource.apple.com/source/configd/configd-453.19/dnsinfo/dnsinfo.h +| | +| | This product optionally depends on 'Brotli4j', Brotli compression and +| | decompression for Java., which can be obtained at: +| | +| | * LICENSE: +| | * license/LICENSE.brotli4j.txt (Apache License 2.0) +| | * HOMEPAGE: +| | * https://github.com/hyperxpro/Brotli4j + +-------------------------------------------------------------------------------- + +This binary artifact includes Eclipse Microprofile OpenAPI with the following in its NOTICE file: + +| ========================================================================= +| == NOTICE file corresponding to section 4(d) of the Apache License, == +| == Version 2.0, in this case for MicroProfile OpenAPI == +| ========================================================================= +| +| The majority of this software were originally based on the following: +| * Swagger Core +| https://github.com/swagger-api/swagger-core +| under Apache License, v2.0 +| +| +| SPDXVersion: SPDX-2.1 +| PackageName: Eclipse MicroProfile +| PackageHomePage: http://www.eclipse.org/microprofile +| PackageLicenseDeclared: Apache-2.0 +| +| PackageCopyrightText: +| Arthur De Magalhaes arthurdm@ca.ibm.com +| diff --git a/flink/v2.0/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java b/flink/v2.0/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java new file mode 100644 index 000000000000..3ba3bb71a151 --- /dev/null +++ b/flink/v2.0/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +public class TestIcebergConnectorSmoke extends TestIcebergConnector {} diff --git a/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java b/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java new file mode 100644 index 000000000000..d7c3a7b32bc8 --- /dev/null +++ b/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.api.common.serialization.SerializerConfig; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.RowDataConverter; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +@Fork(1) +@State(Scope.Benchmark) +@Warmup(iterations = 3) +@Measurement(iterations = 5) +@BenchmarkMode(Mode.SingleShotTime) +public class DynamicRecordSerializerDeserializerBenchmark { + private static final int SAMPLE_SIZE = 100_000; + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "name2", Types.StringType.get()), + Types.NestedField.required(3, "name3", Types.StringType.get()), + Types.NestedField.required(4, "name4", Types.StringType.get()), + Types.NestedField.required(5, "name5", Types.StringType.get()), + Types.NestedField.required(6, "name6", Types.StringType.get()), + Types.NestedField.required(7, "name7", Types.StringType.get()), + Types.NestedField.required(8, "name8", Types.StringType.get()), + Types.NestedField.required(9, "name9", Types.StringType.get())); + + private List rows = Lists.newArrayListWithExpectedSize(SAMPLE_SIZE); + private DynamicRecordInternalType type; + + public static void main(String[] args) throws RunnerException { + Options options = + new OptionsBuilder() + .include(DynamicRecordSerializerDeserializerBenchmark.class.getSimpleName()) + .build(); + new Runner(options).run(); + } + + @Setup + public void setupBenchmark() throws IOException { + List records = RandomGenericData.generate(SCHEMA, SAMPLE_SIZE, 1L); + this.rows = + records.stream() + .map( + r -> + new DynamicRecordInternal( + "t", + "main", + SCHEMA, + RowDataConverter.convert(SCHEMA, r), + PartitionSpec.unpartitioned(), + 1, + false, + Collections.emptySet())) + .collect(Collectors.toList()); + + File warehouse = Files.createTempFile("perf-bench", null).toFile(); + CatalogLoader catalogLoader = + CatalogLoader.hadoop( + "hadoop", + new Configuration(), + ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, 
warehouse.getPath())); + this.type = new DynamicRecordInternalType(catalogLoader, true, 100); + } + + @Benchmark + @Threads(1) + public void testSerialize(Blackhole blackhole) throws IOException { + TypeSerializer serializer = + type.createSerializer((SerializerConfig) null); + DataOutputSerializer outputView = new DataOutputSerializer(1024); + for (int i = 0; i < SAMPLE_SIZE; ++i) { + serializer.serialize(rows.get(i), outputView); + } + } + + @Benchmark + @Threads(1) + public void testSerializeAndDeserialize(Blackhole blackhole) throws IOException { + TypeSerializer serializer = + type.createSerializer((SerializerConfig) null); + + DataOutputSerializer outputView = new DataOutputSerializer(1024); + for (int i = 0; i < SAMPLE_SIZE; ++i) { + serializer.serialize(rows.get(i), outputView); + serializer.deserialize(new DataInputDeserializer(outputView.getSharedBuffer())); + } + } +} diff --git a/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java b/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java new file mode 100644 index 000000000000..80a46ac530e1 --- /dev/null +++ b/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.concurrent.ThreadLocalRandom; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.SortOrderComparators; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +@Fork(1) +@State(Scope.Benchmark) +@Warmup(iterations = 3) +@Measurement(iterations = 5) +@BenchmarkMode(Mode.SingleShotTime) +public class MapRangePartitionerBenchmark { + + private static final int SAMPLE_SIZE = 100_000; + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "name2", Types.StringType.get()), + Types.NestedField.required(3, "name3", Types.StringType.get()), + Types.NestedField.required(4, "name4", Types.StringType.get()), + Types.NestedField.required(5, "name5", Types.StringType.get()), + Types.NestedField.required(6, "name6", Types.StringType.get()), + Types.NestedField.required(7, "name7", Types.StringType.get()), + Types.NestedField.required(8, "name8", Types.StringType.get()), + Types.NestedField.required(9, "name9", Types.StringType.get())); + + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); + private static final Comparator SORT_ORDER_COMPARTOR = + SortOrderComparators.forSchema(SCHEMA, SORT_ORDER); + private static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); + private static final int PARALLELISM = 100; + + private MapRangePartitioner partitioner; + private RowData[] rows; + + @Setup + public void setupBenchmark() { + NavigableMap weights = + DataDistributionUtil.longTailDistribution(100_000, 24, 240, 100, 2.0, 0.7); + Map mapStatistics = + DataDistributionUtil.mapStatisticsWithLongTailDistribution(weights, SORT_KEY); + + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(PARALLELISM, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); + this.partitioner = new MapRangePartitioner(SCHEMA, SORT_ORDER, mapAssignment); + + List keys = Lists.newArrayList(weights.keySet().iterator()); + long[] weightsCDF = DataDistributionUtil.computeCumulativeWeights(keys, weights); + long totalWeight = weightsCDF[weightsCDF.length - 1]; + + // pre-calculate the samples for benchmark run + this.rows = new GenericRowData[SAMPLE_SIZE]; + for (int i = 0; i < SAMPLE_SIZE; ++i) { + long weight = ThreadLocalRandom.current().nextLong(totalWeight); + int index = DataDistributionUtil.binarySearchIndex(weightsCDF, weight); + rows[i] = + GenericRowData.of( + keys.get(index), + DataDistributionUtil.randomString("name2-", 200), + DataDistributionUtil.randomString("name3-", 200), + 
DataDistributionUtil.randomString("name4-", 200), + DataDistributionUtil.randomString("name5-", 200), + DataDistributionUtil.randomString("name6-", 200), + DataDistributionUtil.randomString("name7-", 200), + DataDistributionUtil.randomString("name8-", 200), + DataDistributionUtil.randomString("name9-", 200)); + } + } + + @TearDown + public void tearDownBenchmark() {} + + @Benchmark + @Threads(1) + public void testPartitionerLongTailDistribution(Blackhole blackhole) { + for (int i = 0; i < SAMPLE_SIZE; ++i) { + blackhole.consume(partitioner.partition(rows[i], PARALLELISM)); + } + } +} diff --git a/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java b/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java new file mode 100644 index 000000000000..53a24cd8968a --- /dev/null +++ b/flink/v2.0/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import java.util.UUID; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.types.Types; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +@Fork(1) +@State(Scope.Benchmark) +@Warmup(iterations = 3) +@Measurement(iterations = 5) +@BenchmarkMode(Mode.SingleShotTime) +public class SketchRangePartitionerBenchmark { + + private static final int SAMPLE_SIZE = 100_000; + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.UUIDType.get()), + Types.NestedField.required(2, "name2", Types.StringType.get()), + Types.NestedField.required(3, "name3", Types.StringType.get()), + Types.NestedField.required(4, "name4", Types.StringType.get()), + Types.NestedField.required(5, "name5", Types.StringType.get()), + Types.NestedField.required(6, "name6", Types.StringType.get()), + Types.NestedField.required(7, "name7", Types.StringType.get()), + Types.NestedField.required(8, "name8", Types.StringType.get()), + Types.NestedField.required(9, "name9", Types.StringType.get())); + + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); + private static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); + private static final int PARALLELISM = 100; + + private SketchRangePartitioner partitioner; + private RowData[] rows; + + @Setup + public void setupBenchmark() { + UUID[] reservoir = DataDistributionUtil.reservoirSampleUUIDs(1_000_000, 100_000); + UUID[] rangeBound = DataDistributionUtil.rangeBoundSampleUUIDs(reservoir, PARALLELISM); + SortKey[] rangeBoundSortKeys = + Arrays.stream(rangeBound) + .map( + uuid -> { + SortKey sortKeyCopy = SORT_KEY.copy(); + sortKeyCopy.set(0, uuid); + return sortKeyCopy; + }) + .toArray(SortKey[]::new); + + this.partitioner = new SketchRangePartitioner(SCHEMA, SORT_ORDER, rangeBoundSortKeys); + + // pre-calculate the samples for benchmark run + this.rows = new GenericRowData[SAMPLE_SIZE]; + for (int i = 0; i < SAMPLE_SIZE; ++i) { + UUID uuid = UUID.randomUUID(); + Object uuidBytes = DataDistributionUtil.uuidBytes(uuid); + rows[i] = + GenericRowData.of( + uuidBytes, + DataDistributionUtil.randomString("name2-", 200), + DataDistributionUtil.randomString("name3-", 200), + DataDistributionUtil.randomString("name4-", 200), + DataDistributionUtil.randomString("name5-", 200), + DataDistributionUtil.randomString("name6-", 200), + DataDistributionUtil.randomString("name7-", 200), + DataDistributionUtil.randomString("name8-", 200), + DataDistributionUtil.randomString("name9-", 200)); + } + } + + @TearDown + public void tearDownBenchmark() {} + + @Benchmark + @Threads(1) + public void testPartitionerLongTailDistribution(Blackhole blackhole) { + for (int i = 0; i < SAMPLE_SIZE; ++i) { + blackhole.consume(partitioner.partition(rows[i], PARALLELISM)); + } + } +} diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/flink/table/api/runtime/types/FlinkScalaKryoInstantiator.java b/flink/v2.0/flink/src/main/java/org/apache/flink/table/api/runtime/types/FlinkScalaKryoInstantiator.java new file mode 100644 index 000000000000..7318c147a1b6 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/flink/table/api/runtime/types/FlinkScalaKryoInstantiator.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.flink.table.api.runtime.types; + +/** + * Override Flink's internal FlinkScalaKryoInstantiator to avoid loading the Scala extensions for + * the KryoSerializer. This is a workaround until Kryo-related issues with the Scala extensions are + * fixed. See: https://issues.apache.org/jira/browse/FLINK-37546 + */ +public class FlinkScalaKryoInstantiator {} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java new file mode 100644 index 000000000000..18473bf4f190 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.io.Serializable; +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.CatalogUtil; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.hadoop.SerializableConfiguration; +import org.apache.iceberg.hive.HiveCatalog; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.rest.RESTCatalog; + +/** Serializable loader to load an Iceberg {@link Catalog}. 
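+ *
+ * <p>A minimal usage sketch, assuming a Hadoop catalog (the catalog name and warehouse path
+ * below are hypothetical): the loader is created once on the client or job manager side,
+ * serialized to the task managers, and each task manager then calls {@code loadCatalog()} to
+ * obtain its own {@link Catalog} instance.
+ *
+ * <pre>{@code
+ * CatalogLoader loader =
+ *     CatalogLoader.hadoop(
+ *         "demo", // hypothetical catalog name
+ *         new Configuration(),
+ *         ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, "file:///tmp/warehouse"));
+ * Catalog catalog = loader.loadCatalog(); // typically invoked on the task manager side
+ * }</pre>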
*/ +public interface CatalogLoader extends Serializable, Cloneable { + + /** + * Create a new catalog with the provided properties. NOTICE: for flink, we may initialize the + * {@link CatalogLoader} at flink sql client side or job manager side, and then serialize this + * catalog loader to task manager, finally deserialize it and create a new catalog at task manager + * side. + * + * @return a newly created {@link Catalog} + */ + Catalog loadCatalog(); + + /** Clone a CatalogLoader. */ + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + CatalogLoader clone(); + + static CatalogLoader hadoop( + String name, Configuration hadoopConf, Map properties) { + return new HadoopCatalogLoader(name, hadoopConf, properties); + } + + static CatalogLoader hive(String name, Configuration hadoopConf, Map properties) { + return new HiveCatalogLoader(name, hadoopConf, properties); + } + + static CatalogLoader rest(String name, Configuration hadoopConf, Map properties) { + return new RESTCatalogLoader(name, hadoopConf, properties); + } + + static CatalogLoader custom( + String name, Map properties, Configuration hadoopConf, String impl) { + return new CustomCatalogLoader(name, properties, hadoopConf, impl); + } + + class HadoopCatalogLoader implements CatalogLoader { + private final String catalogName; + private final SerializableConfiguration hadoopConf; + private final String warehouseLocation; + private final Map properties; + + private HadoopCatalogLoader( + String catalogName, Configuration conf, Map properties) { + this.catalogName = catalogName; + this.hadoopConf = new SerializableConfiguration(conf); + this.warehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION); + this.properties = Maps.newHashMap(properties); + } + + @Override + public Catalog loadCatalog() { + return CatalogUtil.loadCatalog( + HadoopCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public CatalogLoader clone() { + return new HadoopCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("catalogName", catalogName) + .add("warehouseLocation", warehouseLocation) + .toString(); + } + } + + class HiveCatalogLoader implements CatalogLoader { + private final String catalogName; + private final SerializableConfiguration hadoopConf; + private final String uri; + private final String warehouse; + private final int clientPoolSize; + private final Map properties; + + private HiveCatalogLoader( + String catalogName, Configuration conf, Map properties) { + this.catalogName = catalogName; + this.hadoopConf = new SerializableConfiguration(conf); + this.uri = properties.get(CatalogProperties.URI); + this.warehouse = properties.get(CatalogProperties.WAREHOUSE_LOCATION); + this.clientPoolSize = + properties.containsKey(CatalogProperties.CLIENT_POOL_SIZE) + ? 
Integer.parseInt(properties.get(CatalogProperties.CLIENT_POOL_SIZE)) + : CatalogProperties.CLIENT_POOL_SIZE_DEFAULT; + this.properties = Maps.newHashMap(properties); + } + + @Override + public Catalog loadCatalog() { + return CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public CatalogLoader clone() { + return new HiveCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("catalogName", catalogName) + .add("uri", uri) + .add("warehouse", warehouse) + .add("clientPoolSize", clientPoolSize) + .toString(); + } + } + + class RESTCatalogLoader implements CatalogLoader { + private final String catalogName; + private final SerializableConfiguration hadoopConf; + private final Map properties; + + private RESTCatalogLoader( + String catalogName, Configuration conf, Map properties) { + this.catalogName = catalogName; + this.hadoopConf = new SerializableConfiguration(conf); + this.properties = Maps.newHashMap(properties); + } + + @Override + public Catalog loadCatalog() { + return CatalogUtil.loadCatalog( + RESTCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public CatalogLoader clone() { + return new RESTCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("catalogName", catalogName) + .add("properties", properties) + .toString(); + } + } + + class CustomCatalogLoader implements CatalogLoader { + + private final SerializableConfiguration hadoopConf; + private final Map properties; + private final String name; + private final String impl; + + private CustomCatalogLoader( + String name, Map properties, Configuration conf, String impl) { + this.hadoopConf = new SerializableConfiguration(conf); + this.properties = Maps.newHashMap(properties); // wrap into a hashmap for serialization + this.name = name; + this.impl = + Preconditions.checkNotNull( + impl, "Cannot initialize custom Catalog, impl class name is null"); + } + + @Override + public Catalog loadCatalog() { + return CatalogUtil.loadCatalog(impl, name, properties, hadoopConf.get()); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public CatalogLoader clone() { + return new CustomCatalogLoader(name, properties, new Configuration(hadoopConf.get()), impl); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this).add("name", name).add("impl", impl).toString(); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java new file mode 100644 index 000000000000..4bb235b811d0 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java @@ -0,0 +1,872 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.CatalogDatabase; +import org.apache.flink.table.catalog.CatalogDatabaseImpl; +import org.apache.flink.table.catalog.CatalogFunction; +import org.apache.flink.table.catalog.CatalogPartition; +import org.apache.flink.table.catalog.CatalogPartitionSpec; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.catalog.TableChange; +import org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; +import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; +import org.apache.flink.table.catalog.stats.CatalogTableStatistics; +import org.apache.flink.table.expressions.Expression; +import org.apache.flink.table.factories.Factory; +import org.apache.flink.util.StringUtils; +import org.apache.iceberg.CachingCatalog; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.SupportsNamespaces; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.AlreadyExistsException; +import org.apache.iceberg.exceptions.NamespaceNotEmptyException; +import org.apache.iceberg.exceptions.NoSuchNamespaceException; +import org.apache.iceberg.flink.util.FlinkAlterTableUtil; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.base.Splitter; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; 
+import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; + +/** + * A Flink Catalog implementation that wraps an Iceberg {@link Catalog}. + * + *
<p>
The mapping between Flink databases and Iceberg namespaces: a base namespace can be supplied for a + * given catalog, so if the catalog supports a 2-level namespace, the first level is supplied in the + * catalog configuration and the second level is exposed as Flink + * databases. + * + *
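+ *
+ * <p>Illustrative sketch of the mapping (the base namespace, database, and table names are
+ * hypothetical): with base namespace {@code level1}, the Flink table path {@code my_db.my_table}
+ * resolves to the Iceberg identifier {@code level1.my_db.my_table}.
+ *
+ * <pre>{@code
+ * // Flink database "my_db"       -> Iceberg namespace "level1.my_db"
+ * // Flink table "my_db.my_table" -> Iceberg table "level1.my_db.my_table"
+ * TableIdentifier id = TableIdentifier.of(Namespace.of("level1", "my_db"), "my_table");
+ * }</pre>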
<p>
The Iceberg table manages its partitions by itself. The partition of the Iceberg table is + * independent of the partition of Flink. + */ +@Internal +public class FlinkCatalog extends AbstractCatalog { + private final CatalogLoader catalogLoader; + private final Catalog icebergCatalog; + private final Namespace baseNamespace; + private final SupportsNamespaces asNamespaceCatalog; + private final Closeable closeable; + private final Map catalogProps; + private final boolean cacheEnabled; + + public FlinkCatalog( + String catalogName, + String defaultDatabase, + Namespace baseNamespace, + CatalogLoader catalogLoader, + Map catalogProps, + boolean cacheEnabled, + long cacheExpirationIntervalMs) { + super(catalogName, defaultDatabase); + this.catalogLoader = catalogLoader; + this.catalogProps = catalogProps; + this.baseNamespace = baseNamespace; + this.cacheEnabled = cacheEnabled; + + Catalog originalCatalog = catalogLoader.loadCatalog(); + icebergCatalog = + cacheEnabled + ? CachingCatalog.wrap(originalCatalog, cacheExpirationIntervalMs) + : originalCatalog; + asNamespaceCatalog = + originalCatalog instanceof SupportsNamespaces ? (SupportsNamespaces) originalCatalog : null; + closeable = originalCatalog instanceof Closeable ? (Closeable) originalCatalog : null; + + FlinkEnvironmentContext.init(); + } + + @Override + public void open() throws CatalogException {} + + @Override + public void close() throws CatalogException { + if (closeable != null) { + try { + closeable.close(); + } catch (IOException e) { + throw new CatalogException(e); + } + } + } + + public Catalog catalog() { + return icebergCatalog; + } + + /** Append a new level to the base namespace */ + private static Namespace appendLevel(Namespace baseNamespace, String newLevel) { + String[] namespace = new String[baseNamespace.levels().length + 1]; + System.arraycopy(baseNamespace.levels(), 0, namespace, 0, baseNamespace.levels().length); + namespace[baseNamespace.levels().length] = newLevel; + return Namespace.of(namespace); + } + + TableIdentifier toIdentifier(ObjectPath path) { + String objectName = path.getObjectName(); + List tableName = Splitter.on('$').splitToList(objectName); + + if (tableName.size() == 1) { + return TableIdentifier.of( + appendLevel(baseNamespace, path.getDatabaseName()), path.getObjectName()); + } else if (tableName.size() == 2 && MetadataTableType.from(tableName.get(1)) != null) { + return TableIdentifier.of( + appendLevel(appendLevel(baseNamespace, path.getDatabaseName()), tableName.get(0)), + tableName.get(1)); + } else { + throw new IllegalArgumentException("Illegal table name:" + objectName); + } + } + + @Override + public List listDatabases() throws CatalogException { + if (asNamespaceCatalog == null) { + return Collections.singletonList(getDefaultDatabase()); + } + + return asNamespaceCatalog.listNamespaces(baseNamespace).stream() + .map(n -> n.level(n.levels().length - 1)) + .collect(Collectors.toList()); + } + + @Override + public CatalogDatabase getDatabase(String databaseName) + throws DatabaseNotExistException, CatalogException { + if (asNamespaceCatalog == null) { + if (!getDefaultDatabase().equals(databaseName)) { + throw new DatabaseNotExistException(getName(), databaseName); + } else { + return new CatalogDatabaseImpl(Maps.newHashMap(), ""); + } + } else { + try { + Map metadata = + Maps.newHashMap( + asNamespaceCatalog.loadNamespaceMetadata(appendLevel(baseNamespace, databaseName))); + String comment = metadata.remove("comment"); + return new CatalogDatabaseImpl(metadata, comment); + } 
catch (NoSuchNamespaceException e) { + throw new DatabaseNotExistException(getName(), databaseName, e); + } + } + } + + @Override + public boolean databaseExists(String databaseName) throws CatalogException { + try { + getDatabase(databaseName); + return true; + } catch (DatabaseNotExistException ignore) { + return false; + } + } + + @Override + public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) + throws DatabaseAlreadyExistException, CatalogException { + createDatabase( + name, mergeComment(database.getProperties(), database.getComment()), ignoreIfExists); + } + + private void createDatabase( + String databaseName, Map metadata, boolean ignoreIfExists) + throws DatabaseAlreadyExistException, CatalogException { + if (asNamespaceCatalog != null) { + try { + asNamespaceCatalog.createNamespace(appendLevel(baseNamespace, databaseName), metadata); + } catch (AlreadyExistsException e) { + if (!ignoreIfExists) { + throw new DatabaseAlreadyExistException(getName(), databaseName, e); + } + } + } else { + throw new UnsupportedOperationException( + "Namespaces are not supported by catalog: " + getName()); + } + } + + private Map mergeComment(Map metadata, String comment) { + Map ret = Maps.newHashMap(metadata); + if (metadata.containsKey("comment")) { + throw new CatalogException("Database properties should not contain key: 'comment'."); + } + + if (!StringUtils.isNullOrWhitespaceOnly(comment)) { + ret.put("comment", comment); + } + return ret; + } + + @Override + public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) + throws DatabaseNotExistException, DatabaseNotEmptyException, CatalogException { + if (asNamespaceCatalog != null) { + try { + boolean success = asNamespaceCatalog.dropNamespace(appendLevel(baseNamespace, name)); + if (!success && !ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name); + } + } catch (NoSuchNamespaceException e) { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name, e); + } + } catch (NamespaceNotEmptyException e) { + throw new DatabaseNotEmptyException(getName(), name, e); + } + } else { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name); + } + } + } + + @Override + public void alterDatabase(String name, CatalogDatabase newDatabase, boolean ignoreIfNotExists) + throws DatabaseNotExistException, CatalogException { + if (asNamespaceCatalog != null) { + Namespace namespace = appendLevel(baseNamespace, name); + Map updates = Maps.newHashMap(); + Set removals = Sets.newHashSet(); + + try { + Map oldProperties = asNamespaceCatalog.loadNamespaceMetadata(namespace); + Map newProperties = + mergeComment(newDatabase.getProperties(), newDatabase.getComment()); + + for (String key : oldProperties.keySet()) { + if (!newProperties.containsKey(key)) { + removals.add(key); + } + } + + for (Map.Entry entry : newProperties.entrySet()) { + if (!entry.getValue().equals(oldProperties.get(entry.getKey()))) { + updates.put(entry.getKey(), entry.getValue()); + } + } + + if (!updates.isEmpty()) { + asNamespaceCatalog.setProperties(namespace, updates); + } + + if (!removals.isEmpty()) { + asNamespaceCatalog.removeProperties(namespace, removals); + } + + } catch (NoSuchNamespaceException e) { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name, e); + } + } + } else { + if (getDefaultDatabase().equals(name)) { + throw new CatalogException( + "Can not alter the default database when the iceberg catalog doesn't support 
namespaces."); + } + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name); + } + } + } + + @Override + public List listTables(String databaseName) + throws DatabaseNotExistException, CatalogException { + try { + return icebergCatalog.listTables(appendLevel(baseNamespace, databaseName)).stream() + .map(TableIdentifier::name) + .collect(Collectors.toList()); + } catch (NoSuchNamespaceException e) { + throw new DatabaseNotExistException(getName(), databaseName, e); + } + } + + @Override + public CatalogTable getTable(ObjectPath tablePath) + throws TableNotExistException, CatalogException { + Table table = loadIcebergTable(tablePath); + + // Flink's CREATE TABLE LIKE clause relies on properties sent back here to create new table. + // Inorder to create such table in non iceberg catalog, we need to send across catalog + // properties also. + // As Flink API accepts only Map for props, here we are serializing catalog + // props as json string to distinguish between catalog and table properties in createTable. + String srcCatalogProps = + FlinkCreateTableOptions.toJson( + getName(), tablePath.getDatabaseName(), tablePath.getObjectName(), catalogProps); + + Map tableProps = table.properties(); + if (tableProps.containsKey(FlinkCreateTableOptions.CONNECTOR_PROPS_KEY) + || tableProps.containsKey(FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY)) { + throw new IllegalArgumentException( + String.format( + "Source table %s contains one/all of the reserved property keys: %s, %s.", + tablePath, + FlinkCreateTableOptions.CONNECTOR_PROPS_KEY, + FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY)); + } + + ImmutableMap.Builder mergedProps = ImmutableMap.builder(); + mergedProps.put( + FlinkCreateTableOptions.CONNECTOR_PROPS_KEY, FlinkDynamicTableFactory.FACTORY_IDENTIFIER); + mergedProps.put(FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY, srcCatalogProps); + mergedProps.putAll(tableProps); + + return toCatalogTableWithProps(table, mergedProps.build()); + } + + private Table loadIcebergTable(ObjectPath tablePath) throws TableNotExistException { + try { + Table table = icebergCatalog.loadTable(toIdentifier(tablePath)); + if (cacheEnabled) { + table.refresh(); + } + + return table; + } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { + throw new TableNotExistException(getName(), tablePath, e); + } + } + + @Override + public boolean tableExists(ObjectPath tablePath) throws CatalogException { + return icebergCatalog.tableExists(toIdentifier(tablePath)); + } + + @Override + public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + try { + icebergCatalog.dropTable(toIdentifier(tablePath)); + } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { + if (!ignoreIfNotExists) { + throw new TableNotExistException(getName(), tablePath, e); + } + } + } + + @Override + public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) + throws TableNotExistException, TableAlreadyExistException, CatalogException { + try { + icebergCatalog.renameTable( + toIdentifier(tablePath), + toIdentifier(new ObjectPath(tablePath.getDatabaseName(), newTableName))); + } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { + if (!ignoreIfNotExists) { + throw new TableNotExistException(getName(), tablePath, e); + } + } catch (AlreadyExistsException e) { + throw new TableAlreadyExistException(getName(), tablePath, e); + } + } + + @Override + public void createTable(ObjectPath tablePath, 
CatalogBaseTable table, boolean ignoreIfExists) + throws CatalogException, TableAlreadyExistException { + // Creating Iceberg table using connector is allowed only when table is created using LIKE + if (Objects.equals( + table.getOptions().get(FlinkCreateTableOptions.CONNECTOR_PROPS_KEY), + FlinkDynamicTableFactory.FACTORY_IDENTIFIER) + && table.getOptions().get(FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY) == null) { + throw new IllegalArgumentException( + "Cannot create the table with 'connector'='iceberg' table property in " + + "an iceberg catalog, Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " + + "create table without 'connector'='iceberg' related properties in an iceberg table."); + } + + Preconditions.checkArgument(table instanceof ResolvedCatalogTable, "table should be resolved"); + createIcebergTable(tablePath, (ResolvedCatalogTable) table, ignoreIfExists); + } + + void createIcebergTable(ObjectPath tablePath, ResolvedCatalogTable table, boolean ignoreIfExists) + throws CatalogException, TableAlreadyExistException { + validateFlinkTable(table); + + Schema icebergSchema = FlinkSchemaUtil.convert(table.getResolvedSchema()); + PartitionSpec spec = toPartitionSpec(table.getPartitionKeys(), icebergSchema); + ImmutableMap.Builder properties = ImmutableMap.builder(); + String location = null; + for (Map.Entry entry : table.getOptions().entrySet()) { + if (!isReservedProperty(entry.getKey())) { + properties.put(entry.getKey(), entry.getValue()); + } else { + // Filtering reserved properties like catalog properties(added to support CREATE TABLE LIKE + // in getTable()), location and not persisting on table properties. + if (FlinkCreateTableOptions.LOCATION_KEY.equalsIgnoreCase(entry.getKey())) { + location = entry.getValue(); + } + } + } + + try { + icebergCatalog.createTable( + toIdentifier(tablePath), icebergSchema, spec, location, properties.build()); + } catch (AlreadyExistsException e) { + if (!ignoreIfExists) { + throw new TableAlreadyExistException(getName(), tablePath, e); + } + } + } + + private boolean isReservedProperty(String prop) { + return FlinkCreateTableOptions.LOCATION_KEY.equalsIgnoreCase(prop) + || FlinkCreateTableOptions.CONNECTOR_PROPS_KEY.equalsIgnoreCase(prop) + || FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY.equalsIgnoreCase(prop); + } + + private static void validateTableSchemaAndPartition(CatalogTable ct1, CatalogTable ct2) { + if (!Objects.equals(ct1.getUnresolvedSchema(), ct2.getUnresolvedSchema())) { + throw new UnsupportedOperationException( + "Altering schema is not supported in the old alterTable API. " + + "To alter schema, use the other alterTable API and provide a list of TableChange's."); + } + + validateTablePartition(ct1, ct2); + } + + private static void validateTablePartition(CatalogTable ct1, CatalogTable ct2) { + if (!ct1.getPartitionKeys().equals(ct2.getPartitionKeys())) { + throw new UnsupportedOperationException("Altering partition keys is not supported yet."); + } + } + + /** + * This alterTable API only supports altering table properties. + * + *
<p>
Support for adding/removing/renaming columns cannot be done by comparing CatalogTable + * instances, unless the Flink schema contains Iceberg column IDs. + * + *
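+ *
+ * <p>Property-level changes can instead be routed through the TableChange-based overload, as
+ * sketched below (the database, table, property values, and {@code newTable} definition are
+ * hypothetical):
+ *
+ * <pre>{@code
+ * List<TableChange> changes =
+ *     ImmutableList.of(
+ *         TableChange.set("write.format.default", "avro"),      // regular table property
+ *         TableChange.set("current-snapshot-id", "123456789")); // applied as a snapshot update
+ * catalog.alterTable(new ObjectPath("my_db", "my_table"), newTable, changes, false);
+ * }</pre>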
<p>
To alter columns, use the other alterTable API and provide a list of TableChange's. + * + * @param tablePath path of the table or view to be modified + * @param newTable the new table definition + * @param ignoreIfNotExists flag to specify behavior when the table or view does not exist: if set + * to false, throw an exception, if set to true, do nothing. + * @throws CatalogException in case of any runtime exception + * @throws TableNotExistException if the table does not exist + */ + @Override + public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean ignoreIfNotExists) + throws CatalogException, TableNotExistException { + validateFlinkTable(newTable); + + Table icebergTable; + try { + icebergTable = loadIcebergTable(tablePath); + } catch (TableNotExistException e) { + if (!ignoreIfNotExists) { + throw e; + } else { + return; + } + } + + CatalogTable table = toCatalogTable(icebergTable); + validateTableSchemaAndPartition(table, (CatalogTable) newTable); + + Map oldProperties = table.getOptions(); + Map setProperties = Maps.newHashMap(); + + String setLocation = null; + String setSnapshotId = null; + String pickSnapshotId = null; + + for (Map.Entry entry : newTable.getOptions().entrySet()) { + String key = entry.getKey(); + String value = entry.getValue(); + + if (Objects.equals(value, oldProperties.get(key))) { + continue; + } + + if (FlinkCreateTableOptions.LOCATION_KEY.equalsIgnoreCase(key)) { + setLocation = value; + } else if ("current-snapshot-id".equalsIgnoreCase(key)) { + setSnapshotId = value; + } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(key)) { + pickSnapshotId = value; + } else { + setProperties.put(key, value); + } + } + + oldProperties + .keySet() + .forEach( + k -> { + if (!newTable.getOptions().containsKey(k)) { + setProperties.put(k, null); + } + }); + + FlinkAlterTableUtil.commitChanges( + icebergTable, setLocation, setSnapshotId, pickSnapshotId, setProperties); + } + + @Override + public void alterTable( + ObjectPath tablePath, + CatalogBaseTable newTable, + List tableChanges, + boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + validateFlinkTable(newTable); + + Table icebergTable; + try { + icebergTable = loadIcebergTable(tablePath); + } catch (TableNotExistException e) { + if (!ignoreIfNotExists) { + throw e; + } else { + return; + } + } + + // Does not support altering partition yet. 
+ validateTablePartition(toCatalogTable(icebergTable), (CatalogTable) newTable); + + String setLocation = null; + String setSnapshotId = null; + String cherrypickSnapshotId = null; + + List propertyChanges = Lists.newArrayList(); + List schemaChanges = Lists.newArrayList(); + for (TableChange change : tableChanges) { + if (change instanceof TableChange.SetOption) { + TableChange.SetOption set = (TableChange.SetOption) change; + + if (FlinkCreateTableOptions.LOCATION_KEY.equalsIgnoreCase(set.getKey())) { + setLocation = set.getValue(); + } else if ("current-snapshot-id".equalsIgnoreCase(set.getKey())) { + setSnapshotId = set.getValue(); + } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(set.getKey())) { + cherrypickSnapshotId = set.getValue(); + } else { + propertyChanges.add(change); + } + } else if (change instanceof TableChange.ResetOption) { + propertyChanges.add(change); + } else { + schemaChanges.add(change); + } + } + + FlinkAlterTableUtil.commitChanges( + icebergTable, + setLocation, + setSnapshotId, + cherrypickSnapshotId, + schemaChanges, + propertyChanges); + } + + private static void validateFlinkTable(CatalogBaseTable table) { + Preconditions.checkArgument( + table instanceof CatalogTable, "The Table should be a CatalogTable."); + + org.apache.flink.table.api.Schema schema = table.getUnresolvedSchema(); + schema + .getColumns() + .forEach( + column -> { + if (!FlinkCompatibilityUtil.isPhysicalColumn(column)) { + throw new UnsupportedOperationException( + "Creating table with computed columns is not supported yet."); + } + }); + + if (!schema.getWatermarkSpecs().isEmpty()) { + throw new UnsupportedOperationException( + "Creating table with watermark specs is not supported yet."); + } + } + + private static PartitionSpec toPartitionSpec(List partitionKeys, Schema icebergSchema) { + PartitionSpec.Builder builder = PartitionSpec.builderFor(icebergSchema); + partitionKeys.forEach(builder::identity); + return builder.build(); + } + + private static List toPartitionKeys(PartitionSpec spec, Schema icebergSchema) { + ImmutableList.Builder partitionKeysBuilder = ImmutableList.builder(); + for (PartitionField field : spec.fields()) { + if (field.transform().isIdentity()) { + partitionKeysBuilder.add(icebergSchema.findColumnName(field.sourceId())); + } else { + // Not created by Flink SQL. + // For compatibility with iceberg tables, return empty. + // TODO modify this after Flink support partition transform. + return Collections.emptyList(); + } + } + return partitionKeysBuilder.build(); + } + + static CatalogTable toCatalogTableWithProps(Table table, Map props) { + ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(table.schema()); + List partitionKeys = toPartitionKeys(table.spec(), table.schema()); + + // NOTE: We can not create a IcebergCatalogTable extends CatalogTable, because Flink optimizer + // may use DefaultCatalogTable to copy a new catalog table. + // Let's re-loading table from Iceberg catalog when creating source/sink operators. 
+ return CatalogTable.newBuilder() + .schema( + org.apache.flink.table.api.Schema.newBuilder() + .fromResolvedSchema(resolvedSchema) + .build()) + .partitionKeys(partitionKeys) + .options(props) + .build(); + } + + static CatalogTable toCatalogTable(Table table) { + return toCatalogTableWithProps(table, table.properties()); + } + + @Override + public Optional getFactory() { + return Optional.of(new FlinkDynamicTableFactory(this)); + } + + CatalogLoader getCatalogLoader() { + return catalogLoader; + } + + // ------------------------------ Unsupported methods + // --------------------------------------------- + + @Override + public List listViews(String databaseName) throws CatalogException { + return Collections.emptyList(); + } + + @Override + public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void createPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition partition, + boolean ignoreIfExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void dropPartition( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition newPartition, + boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List listFunctions(String dbName) throws CatalogException { + return Collections.emptyList(); + } + + @Override + public CatalogFunction getFunction(ObjectPath functionPath) + throws FunctionNotExistException, CatalogException { + throw new FunctionNotExistException(getName(), functionPath); + } + + @Override + public boolean functionExists(ObjectPath functionPath) throws CatalogException { + return false; + } + + @Override + public void createFunction( + ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterFunction( + ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterTableStatistics( + ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterTableColumnStatistics( + ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterPartitionStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogTableStatistics partitionStatistics, + boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void 
alterPartitionColumnStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogColumnStatistics columnStatistics, + boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List listPartitions(ObjectPath tablePath) + throws TableNotExistException, TableNotPartitionedException, CatalogException { + Table table = loadIcebergTable(tablePath); + + if (table.spec().isUnpartitioned()) { + throw new TableNotPartitionedException(icebergCatalog.name(), tablePath); + } + + Set set = Sets.newHashSet(); + try (CloseableIterable tasks = table.newScan().planFiles()) { + for (DataFile dataFile : CloseableIterable.transform(tasks, FileScanTask::file)) { + Map map = Maps.newHashMap(); + StructLike structLike = dataFile.partition(); + PartitionSpec spec = table.specs().get(dataFile.specId()); + for (int i = 0; i < structLike.size(); i++) { + map.put(spec.fields().get(i).name(), String.valueOf(structLike.get(i, Object.class))); + } + set.add(new CatalogPartitionSpec(map)); + } + } catch (IOException e) { + throw new CatalogException( + String.format("Failed to list partitions of table %s", tablePath), e); + } + + return Lists.newArrayList(set); + } + + @Override + public List listPartitions( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List listPartitionsByFilter( + ObjectPath tablePath, List filters) throws CatalogException { + throw new UnsupportedOperationException(); + } + + // After partition pruning and filter push down, the statistics have become very inaccurate, so + // the statistics from + // here are of little significance. + // Flink will support something like SupportsReportStatistics in future. + + @Override + public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) throws CatalogException { + return CatalogTableStatistics.UNKNOWN; + } + + @Override + public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) + throws CatalogException { + return CatalogColumnStatistics.UNKNOWN; + } + + @Override + public CatalogTableStatistics getPartitionStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { + return CatalogTableStatistics.UNKNOWN; + } + + @Override + public CatalogColumnStatistics getPartitionColumnStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { + return CatalogColumnStatistics.UNKNOWN; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java new file mode 100644 index 000000000000..33cbc92ddeec --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.GlobalConfiguration; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.table.catalog.Catalog; +import org.apache.flink.table.factories.CatalogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.base.Strings; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.util.PropertyUtil; + +/** + * A Flink Catalog factory implementation that creates {@link FlinkCatalog}. + * + *

This supports the following catalog configuration options:
+ *
+ * <ul>
+ *   <li>type - Flink catalog factory key, should be "iceberg"
+ *   <li>catalog-type - iceberg catalog type, "hive", "hadoop" or "rest"
+ *   <li>uri - the Hive Metastore URI (Hive catalog only)
+ *   <li>clients - the Hive Client Pool Size (Hive catalog only)
+ *   <li>warehouse - the warehouse path (Hadoop catalog only)
+ *   <li>default-database - a database name to use as the default
+ *   <li>base-namespace - a base namespace as the prefix for all databases (Hadoop
+ *       catalog only)
+ *   <li>cache-enabled - whether to enable catalog cache
+ * </ul>
+ *
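As a usage sketch (not part of this patch; the catalog name, metastore URI and warehouse path below are placeholders), these options are usually supplied through a CREATE CATALOG statement executed on a TableEnvironment:

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

public class CreateIcebergCatalogExample {
  public static void main(String[] args) {
    TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());
    // 'type'='iceberg' selects this factory; the remaining keys are the options listed above.
    tEnv.executeSql(
        "CREATE CATALOG hive_iceberg WITH ("
            + "'type'='iceberg',"
            + "'catalog-type'='hive',"
            + "'uri'='thrift://localhost:9083',"        // placeholder metastore URI
            + "'warehouse'='hdfs://nn:8020/warehouse'," // placeholder warehouse path
            + "'cache-enabled'='true')");
    tEnv.executeSql("USE CATALOG hive_iceberg");
  }
}

Once the catalog is registered, Flink SQL can address Iceberg tables as hive_iceberg.db.tbl.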

To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override + * {@link #createCatalogLoader(String, Map, Configuration)}. + */ +public class FlinkCatalogFactory implements CatalogFactory { + + public static final String FACTORY_IDENTIFIER = "iceberg"; + + // Can not just use "type", it conflicts with CATALOG_TYPE. + public static final String ICEBERG_CATALOG_TYPE = "catalog-type"; + public static final String ICEBERG_CATALOG_TYPE_HADOOP = "hadoop"; + public static final String ICEBERG_CATALOG_TYPE_HIVE = "hive"; + public static final String ICEBERG_CATALOG_TYPE_REST = "rest"; + + public static final String HIVE_CONF_DIR = "hive-conf-dir"; + public static final String HADOOP_CONF_DIR = "hadoop-conf-dir"; + public static final String DEFAULT_DATABASE = "default-database"; + public static final String DEFAULT_DATABASE_NAME = "default"; + public static final String DEFAULT_CATALOG_NAME = "default_catalog"; + public static final String BASE_NAMESPACE = "base-namespace"; + + /** + * Create an Iceberg {@link org.apache.iceberg.catalog.Catalog} loader to be used by this Flink + * catalog adapter. + * + * @param name Flink's catalog name + * @param properties Flink's catalog properties + * @param hadoopConf Hadoop configuration for catalog + * @return an Iceberg catalog loader + */ + static CatalogLoader createCatalogLoader( + String name, Map properties, Configuration hadoopConf) { + String catalogImpl = properties.get(CatalogProperties.CATALOG_IMPL); + if (catalogImpl != null) { + String catalogType = properties.get(ICEBERG_CATALOG_TYPE); + Preconditions.checkArgument( + catalogType == null, + "Cannot create catalog %s, both catalog-type and catalog-impl are set: catalog-type=%s, catalog-impl=%s", + name, + catalogType, + catalogImpl); + return CatalogLoader.custom(name, properties, hadoopConf, catalogImpl); + } + + String catalogType = properties.getOrDefault(ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HIVE); + switch (catalogType.toLowerCase(Locale.ENGLISH)) { + case ICEBERG_CATALOG_TYPE_HIVE: + // The values of properties 'uri', 'warehouse', 'hive-conf-dir' are allowed to be null, in + // that case it will + // fallback to parse those values from hadoop configuration which is loaded from classpath. 
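        // mergeHiveConf(..) below is what assembles that Configuration: hive-site.xml from
        // 'hive-conf-dir' (or, failing that, the classpath) plus hdfs-site.xml and core-site.xml
        // from 'hadoop-conf-dir'.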
+ String hiveConfDir = properties.get(HIVE_CONF_DIR); + String hadoopConfDir = properties.get(HADOOP_CONF_DIR); + Configuration newHadoopConf = mergeHiveConf(hadoopConf, hiveConfDir, hadoopConfDir); + return CatalogLoader.hive(name, newHadoopConf, properties); + + case ICEBERG_CATALOG_TYPE_HADOOP: + return CatalogLoader.hadoop(name, hadoopConf, properties); + + case ICEBERG_CATALOG_TYPE_REST: + return CatalogLoader.rest(name, hadoopConf, properties); + + default: + throw new UnsupportedOperationException( + "Unknown catalog-type: " + catalogType + " (Must be 'hive', 'hadoop' or 'rest')"); + } + } + + @Override + public String factoryIdentifier() { + return FACTORY_IDENTIFIER; + } + + @Override + public Set> requiredOptions() { + return ImmutableSet.>builder().build(); + } + + @Override + public Set> optionalOptions() { + return ImmutableSet.>builder().build(); + } + + @Override + public Catalog createCatalog(Context context) { + return createCatalog(context.getName(), context.getOptions(), clusterHadoopConf()); + } + + protected Catalog createCatalog( + String name, Map properties, Configuration hadoopConf) { + CatalogLoader catalogLoader = createCatalogLoader(name, properties, hadoopConf); + String defaultDatabase = properties.getOrDefault(DEFAULT_DATABASE, DEFAULT_DATABASE_NAME); + + Namespace baseNamespace = Namespace.empty(); + if (properties.containsKey(BASE_NAMESPACE)) { + baseNamespace = Namespace.of(properties.get(BASE_NAMESPACE).split("\\.")); + } + + boolean cacheEnabled = + PropertyUtil.propertyAsBoolean( + properties, CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); + + long cacheExpirationIntervalMs = + PropertyUtil.propertyAsLong( + properties, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_OFF); + Preconditions.checkArgument( + cacheExpirationIntervalMs != 0, + "%s is not allowed to be 0.", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS); + + return new FlinkCatalog( + name, + defaultDatabase, + baseNamespace, + catalogLoader, + properties, + cacheEnabled, + cacheExpirationIntervalMs); + } + + private static Configuration mergeHiveConf( + Configuration hadoopConf, String hiveConfDir, String hadoopConfDir) { + Configuration newConf = new Configuration(hadoopConf); + if (!Strings.isNullOrEmpty(hiveConfDir)) { + Preconditions.checkState( + Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), + "There should be a hive-site.xml file under the directory %s", + hiveConfDir); + newConf.addResource(new Path(hiveConfDir, "hive-site.xml")); + } else { + // If don't provide the hive-site.xml path explicitly, it will try to load resource from + // classpath. If still + // couldn't load the configuration file, then it will throw exception in HiveCatalog. 
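      // Note: when 'hive-conf-dir' is set, a hive-site.xml in that directory is mandatory
      // (enforced by the checkState in the branch above); the classpath lookup below is only a
      // best-effort fallback.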
+ URL configFile = CatalogLoader.class.getClassLoader().getResource("hive-site.xml"); + if (configFile != null) { + newConf.addResource(configFile); + } + } + + if (!Strings.isNullOrEmpty(hadoopConfDir)) { + Preconditions.checkState( + Files.exists(Paths.get(hadoopConfDir, "hdfs-site.xml")), + "Failed to load Hadoop configuration: missing %s", + Paths.get(hadoopConfDir, "hdfs-site.xml")); + newConf.addResource(new Path(hadoopConfDir, "hdfs-site.xml")); + Preconditions.checkState( + Files.exists(Paths.get(hadoopConfDir, "core-site.xml")), + "Failed to load Hadoop configuration: missing %s", + Paths.get(hadoopConfDir, "core-site.xml")); + newConf.addResource(new Path(hadoopConfDir, "core-site.xml")); + } + + return newConf; + } + + public static Configuration clusterHadoopConf() { + return HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration()); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java new file mode 100644 index 000000000000..e0672811cf5f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java @@ -0,0 +1,297 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import org.apache.flink.annotation.Internal; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.util.TimeUtils; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +@Internal +public class FlinkConfParser { + + private final Map tableProperties; + private final Map options; + private final ReadableConfig readableConfig; + + public FlinkConfParser(Table table, Map options, ReadableConfig readableConfig) { + this.tableProperties = table.properties(); + this.options = options; + this.readableConfig = readableConfig; + } + + FlinkConfParser(Map options, ReadableConfig readableConfig) { + this.tableProperties = ImmutableMap.of(); + this.options = options; + this.readableConfig = readableConfig; + } + + public BooleanConfParser booleanConf() { + return new BooleanConfParser(); + } + + public IntConfParser intConf() { + return new IntConfParser(); + } + + public LongConfParser longConf() { + return new LongConfParser(); + } + + public DoubleConfParser doubleConf() { + return new DoubleConfParser(); + } + + public > EnumConfParser enumConfParser(Class enumClass) { + return new EnumConfParser<>(enumClass); + } + + public StringConfParser stringConf() { + return new StringConfParser(); + } + + public DurationConfParser durationConf() { + return new DurationConfParser(); + } + + public class BooleanConfParser extends ConfParser { + private Boolean defaultValue; + + @Override + protected BooleanConfParser self() { + return this; + } + + public BooleanConfParser defaultValue(boolean value) { + this.defaultValue = value; + return self(); + } + + public BooleanConfParser defaultValue(String value) { + this.defaultValue = Boolean.parseBoolean(value); + return self(); + } + + public boolean parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(Boolean::parseBoolean, defaultValue); + } + } + + public class IntConfParser extends ConfParser { + private Integer defaultValue; + + @Override + protected IntConfParser self() { + return this; + } + + public IntConfParser defaultValue(int value) { + this.defaultValue = value; + return self(); + } + + public int parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(Integer::parseInt, defaultValue); + } + + public Integer parseOptional() { + return parse(Integer::parseInt, null); + } + } + + public class LongConfParser extends ConfParser { + private Long defaultValue; + + @Override + protected LongConfParser self() { + return this; + } + + public LongConfParser defaultValue(long value) { + this.defaultValue = value; + return self(); + } + + public long parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(Long::parseLong, defaultValue); + } + + public Long parseOptional() { + return parse(Long::parseLong, null); + } + } + + public class DoubleConfParser extends ConfParser { + private Double defaultValue; + + @Override + protected DoubleConfParser self() { + return this; + } + + public DoubleConfParser defaultValue(double value) { + this.defaultValue = value; + return 
self(); + } + + public double parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(Double::parseDouble, defaultValue); + } + + public Double parseOptional() { + return parse(Double::parseDouble, null); + } + } + + public class StringConfParser extends ConfParser { + private String defaultValue; + + @Override + protected StringConfParser self() { + return this; + } + + public StringConfParser defaultValue(String value) { + this.defaultValue = value; + return self(); + } + + public String parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(Function.identity(), defaultValue); + } + + public String parseOptional() { + return parse(Function.identity(), null); + } + } + + public class EnumConfParser> extends ConfParser, E> { + private E defaultValue; + private final Class enumClass; + + EnumConfParser(Class enumClass) { + this.enumClass = enumClass; + } + + @Override + protected EnumConfParser self() { + return this; + } + + public EnumConfParser defaultValue(E value) { + this.defaultValue = value; + return self(); + } + + public E parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(s -> Enum.valueOf(enumClass, s), defaultValue); + } + + public E parseOptional() { + return parse(s -> Enum.valueOf(enumClass, s), null); + } + } + + public class DurationConfParser extends ConfParser { + private Duration defaultValue; + + @Override + protected DurationConfParser self() { + return this; + } + + public DurationConfParser defaultValue(Duration value) { + this.defaultValue = value; + return self(); + } + + public Duration parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(TimeUtils::parseDuration, defaultValue); + } + + public Duration parseOptional() { + return parse(TimeUtils::parseDuration, null); + } + } + + public abstract class ConfParser { + private final List optionNames = Lists.newArrayList(); + private String tablePropertyName; + private ConfigOption configOption; + + protected abstract ThisT self(); + + public ThisT option(String name) { + this.optionNames.add(name); + return self(); + } + + public ThisT flinkConfig(ConfigOption newConfigOption) { + this.configOption = newConfigOption; + return self(); + } + + public ThisT tableProperty(String name) { + this.tablePropertyName = name; + return self(); + } + + protected T parse(Function conversion, T defaultValue) { + if (!optionNames.isEmpty()) { + for (String optionName : optionNames) { + String optionValue = options.get(optionName); + if (optionValue != null) { + return conversion.apply(optionValue); + } + } + } + + if (configOption != null) { + T propertyValue = readableConfig.get(configOption); + if (propertyValue != null) { + return propertyValue; + } + } + + if (tablePropertyName != null) { + String propertyValue = tableProperties.get(tablePropertyName); + if (propertyValue != null) { + return conversion.apply(propertyValue); + } + } + + return defaultValue; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java new file mode 100644 index 000000000000..97e2c70d348e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.description.Description; +import org.apache.flink.configuration.description.TextElement; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.iceberg.flink.source.assigner.SplitAssignerType; +import org.apache.iceberg.util.ThreadPools; + +/** + * When constructing Flink Iceberg source via Java API, configs can be set in {@link Configuration} + * passed to source builder. E.g. + * + *

+ *   configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true);
+ *   FlinkSource.forRowData()
+ *       .flinkConf(configuration)
+ *       ...
+ * </pre>
+ *
+ * <p>When using Flink SQL/table API, connector options can be set in Flink's {@link
+ * TableEnvironment}.
+ *
+ * <pre>
+ *   TableEnvironment tEnv = createTableEnv();
+ *   tEnv.getConfig()
+ *        .getConfiguration()
+ *        .setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true);
+ * </pre>
+ */ +public class FlinkConfigOptions { + + private FlinkConfigOptions() {} + + public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM = + ConfigOptions.key("table.exec.iceberg.infer-source-parallelism") + .booleanType() + .defaultValue(true) + .withDescription( + "If is false, parallelism of source are set by config.\n" + + "If is true, source parallelism is inferred according to splits number.\n"); + + public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX = + ConfigOptions.key("table.exec.iceberg.infer-source-parallelism.max") + .intType() + .defaultValue(100) + .withDescription("Sets max infer parallelism for source operator."); + + public static final ConfigOption TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO = + ConfigOptions.key("table.exec.iceberg.expose-split-locality-info") + .booleanType() + .noDefaultValue() + .withDescription( + "Expose split host information to use Flink's locality aware split assigner."); + + public static final ConfigOption SOURCE_READER_FETCH_BATCH_RECORD_COUNT = + ConfigOptions.key("table.exec.iceberg.fetch-batch-record-count") + .intType() + .defaultValue(2048) + .withDescription("The target number of records for Iceberg reader fetch batch."); + + public static final ConfigOption TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE = + ConfigOptions.key("table.exec.iceberg.worker-pool-size") + .intType() + .defaultValue(ThreadPools.WORKER_THREAD_POOL_SIZE) + .withDescription("The size of workers pool used to plan or scan manifests."); + + public static final ConfigOption TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE = + ConfigOptions.key("table.exec.iceberg.use-flip27-source") + .booleanType() + .defaultValue(true) + .withDescription("Use the FLIP-27 based Iceberg source implementation."); + + public static final ConfigOption TABLE_EXEC_ICEBERG_USE_V2_SINK = + ConfigOptions.key("table.exec.iceberg.use-v2-sink") + .booleanType() + .defaultValue(false) + .withDescription("Use the SinkV2 API based Iceberg sink implementation."); + + public static final ConfigOption TABLE_EXEC_SPLIT_ASSIGNER_TYPE = + ConfigOptions.key("table.exec.iceberg.split-assigner-type") + .enumType(SplitAssignerType.class) + .defaultValue(SplitAssignerType.SIMPLE) + .withDescription( + Description.builder() + .text("Split assigner type that determine how splits are assigned to readers.") + .linebreak() + .list( + TextElement.text( + SplitAssignerType.SIMPLE + + ": simple assigner that doesn't provide any guarantee on order or locality.")) + .build()); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java new file mode 100644 index 000000000000..ab69ec5adc7f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.util.Map; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.iceberg.util.JsonUtil; + +class FlinkCreateTableOptions { + private final String catalogName; + private final String catalogDb; + private final String catalogTable; + private final Map catalogProps; + + private FlinkCreateTableOptions( + String catalogName, String catalogDb, String catalogTable, Map props) { + this.catalogName = catalogName; + this.catalogDb = catalogDb; + this.catalogTable = catalogTable; + this.catalogProps = props; + } + + public static final ConfigOption CATALOG_NAME = + ConfigOptions.key("catalog-name") + .stringType() + .noDefaultValue() + .withDescription("Catalog name"); + + public static final ConfigOption CATALOG_TYPE = + ConfigOptions.key(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE) + .stringType() + .noDefaultValue() + .withDescription("Catalog type, the optional types are: custom, hadoop, hive."); + + public static final ConfigOption CATALOG_DATABASE = + ConfigOptions.key("catalog-database") + .stringType() + .defaultValue(FlinkCatalogFactory.DEFAULT_DATABASE_NAME) + .withDescription("Database name managed in the iceberg catalog."); + + public static final ConfigOption CATALOG_TABLE = + ConfigOptions.key("catalog-table") + .stringType() + .noDefaultValue() + .withDescription("Table name managed in the underlying iceberg catalog and database."); + + public static final ConfigOption> CATALOG_PROPS = + ConfigOptions.key("catalog-props") + .mapType() + .noDefaultValue() + .withDescription("Properties for the underlying catalog for iceberg table."); + + public static final String SRC_CATALOG_PROPS_KEY = "src-catalog"; + public static final String CONNECTOR_PROPS_KEY = "connector"; + public static final String LOCATION_KEY = "location"; + + static String toJson( + String catalogName, String catalogDb, String catalogTable, Map catalogProps) { + return JsonUtil.generate( + gen -> { + gen.writeStartObject(); + gen.writeStringField(CATALOG_NAME.key(), catalogName); + gen.writeStringField(CATALOG_DATABASE.key(), catalogDb); + gen.writeStringField(CATALOG_TABLE.key(), catalogTable); + JsonUtil.writeStringMap(CATALOG_PROPS.key(), catalogProps, gen); + gen.writeEndObject(); + }, + false); + } + + static FlinkCreateTableOptions fromJson(String createTableOptions) { + return JsonUtil.parse( + createTableOptions, + node -> { + String catalogName = JsonUtil.getString(CATALOG_NAME.key(), node); + String catalogDb = JsonUtil.getString(CATALOG_DATABASE.key(), node); + String catalogTable = JsonUtil.getString(CATALOG_TABLE.key(), node); + Map catalogProps = JsonUtil.getStringMap(CATALOG_PROPS.key(), node); + + return new FlinkCreateTableOptions(catalogName, catalogDb, catalogTable, catalogProps); + }); + } + + String catalogName() { + return catalogName; + } + + String catalogDb() { + return catalogDb; + } + + String catalogTable() { + return catalogTable; + } + + Map catalogProps() { + return catalogProps; + } +} diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java new file mode 100644 index 000000000000..bd79c1156090 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.catalog.CatalogDatabaseImpl; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ObjectIdentifier; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.connector.sink.DynamicTableSink; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.factories.DynamicTableSinkFactory; +import org.apache.flink.table.factories.DynamicTableSourceFactory; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.AlreadyExistsException; +import org.apache.iceberg.flink.source.IcebergTableSource; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; + +public class FlinkDynamicTableFactory + implements DynamicTableSinkFactory, DynamicTableSourceFactory { + static final String FACTORY_IDENTIFIER = "iceberg"; + private final FlinkCatalog catalog; + + public FlinkDynamicTableFactory() { + this.catalog = null; + } + + public FlinkDynamicTableFactory(FlinkCatalog catalog) { + this.catalog = catalog; + } + + @Override + public DynamicTableSource createDynamicTableSource(Context context) { + ObjectIdentifier objectIdentifier = context.getObjectIdentifier(); + ResolvedCatalogTable resolvedCatalogTable = context.getCatalogTable(); + Map tableProps = resolvedCatalogTable.getOptions(); + ResolvedSchema resolvedSchema = + ResolvedSchema.of( + resolvedCatalogTable.getResolvedSchema().getColumns().stream() + .filter(Column::isPhysical) + .collect(Collectors.toList())); + + TableLoader tableLoader; + if (catalog != null) { + tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath()); + } else { + tableLoader = 
+ createTableLoader( + resolvedCatalogTable, + tableProps, + objectIdentifier.getDatabaseName(), + objectIdentifier.getObjectName()); + } + + return new IcebergTableSource( + tableLoader, resolvedSchema, tableProps, context.getConfiguration()); + } + + @Override + public DynamicTableSink createDynamicTableSink(Context context) { + ObjectIdentifier objectIdentifier = context.getObjectIdentifier(); + ResolvedCatalogTable resolvedCatalogTable = context.getCatalogTable(); + Map writeProps = resolvedCatalogTable.getOptions(); + ResolvedSchema resolvedSchema = + ResolvedSchema.of( + resolvedCatalogTable.getResolvedSchema().getColumns().stream() + .filter(Column::isPhysical) + .collect(Collectors.toList())); + + TableLoader tableLoader; + if (catalog != null) { + tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath()); + } else { + tableLoader = + createTableLoader( + resolvedCatalogTable, + writeProps, + objectIdentifier.getDatabaseName(), + objectIdentifier.getObjectName()); + } + + return new IcebergTableSink( + tableLoader, resolvedSchema, context.getConfiguration(), writeProps); + } + + @Override + public Set> requiredOptions() { + Set> options = Sets.newHashSet(); + options.add(FlinkCreateTableOptions.CATALOG_TYPE); + options.add(FlinkCreateTableOptions.CATALOG_NAME); + return options; + } + + @Override + public Set> optionalOptions() { + Set> options = Sets.newHashSet(); + options.add(FlinkCreateTableOptions.CATALOG_DATABASE); + options.add(FlinkCreateTableOptions.CATALOG_TABLE); + return options; + } + + @Override + public String factoryIdentifier() { + return FACTORY_IDENTIFIER; + } + + private static TableLoader createTableLoader( + ResolvedCatalogTable resolvedCatalogTable, + Map tableProps, + String databaseName, + String tableName) { + Configuration flinkConf = new Configuration(); + + Map mergedProps = mergeSrcCatalogProps(tableProps); + + mergedProps.forEach(flinkConf::setString); + + String catalogName = flinkConf.get(FlinkCreateTableOptions.CATALOG_NAME); + Preconditions.checkNotNull( + catalogName, + "Table property '%s' cannot be null", + FlinkCreateTableOptions.CATALOG_NAME.key()); + + String catalogDatabase = flinkConf.get(FlinkCreateTableOptions.CATALOG_DATABASE, databaseName); + Preconditions.checkNotNull(catalogDatabase, "The iceberg database name cannot be null"); + + String catalogTable = flinkConf.get(FlinkCreateTableOptions.CATALOG_TABLE, tableName); + Preconditions.checkNotNull(catalogTable, "The iceberg table name cannot be null"); + + org.apache.hadoop.conf.Configuration hadoopConf = FlinkCatalogFactory.clusterHadoopConf(); + FlinkCatalogFactory factory = new FlinkCatalogFactory(); + FlinkCatalog flinkCatalog = + (FlinkCatalog) factory.createCatalog(catalogName, mergedProps, hadoopConf); + ObjectPath objectPath = new ObjectPath(catalogDatabase, catalogTable); + + // Create database if not exists in the external catalog. + if (!flinkCatalog.databaseExists(catalogDatabase)) { + try { + flinkCatalog.createDatabase( + catalogDatabase, new CatalogDatabaseImpl(Maps.newHashMap(), null), true); + } catch (DatabaseAlreadyExistException e) { + throw new AlreadyExistsException( + e, + "Database %s already exists in the iceberg catalog %s.", + catalogName, + catalogDatabase); + } + } + + // Create table if not exists in the external catalog. 
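    // (Mirrors the database bootstrap above: when the table is declared with 'connector'='iceberg'
    // outside of an Iceberg catalog, the backing Iceberg table is created on first use from the
    // resolved Flink schema rather than failing the job.)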
+ if (!flinkCatalog.tableExists(objectPath)) { + try { + flinkCatalog.createIcebergTable(objectPath, resolvedCatalogTable, true); + } catch (TableAlreadyExistException e) { + throw new AlreadyExistsException( + e, + "Table %s already exists in the database %s and catalog %s", + catalogTable, + catalogDatabase, + catalogName); + } + } + + return TableLoader.fromCatalog( + flinkCatalog.getCatalogLoader(), TableIdentifier.of(catalogDatabase, catalogTable)); + } + + /** + * Merges source catalog properties with connector properties. Iceberg Catalog properties are + * serialized as json in FlinkCatalog#getTable to be able to isolate catalog props from iceberg + * table props, Here, we flatten and merge them back to use to create catalog. + * + * @param tableProps the existing table properties + * @return a map of merged properties, with source catalog properties taking precedence when keys + * conflict + */ + private static Map mergeSrcCatalogProps(Map tableProps) { + String srcCatalogProps = tableProps.get(FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY); + if (srcCatalogProps != null) { + Map mergedProps = Maps.newHashMap(); + FlinkCreateTableOptions createTableOptions = + FlinkCreateTableOptions.fromJson(srcCatalogProps); + + mergedProps.put(FlinkCreateTableOptions.CATALOG_NAME.key(), createTableOptions.catalogName()); + mergedProps.put( + FlinkCreateTableOptions.CATALOG_DATABASE.key(), createTableOptions.catalogDb()); + mergedProps.put( + FlinkCreateTableOptions.CATALOG_TABLE.key(), createTableOptions.catalogTable()); + mergedProps.putAll(createTableOptions.catalogProps()); + + tableProps.forEach( + (k, v) -> { + if (!FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY.equals(k)) { + mergedProps.put(k, v); + } + }); + + return Collections.unmodifiableMap(mergedProps); + } + + return tableProps; + } + + private static TableLoader createTableLoader(FlinkCatalog catalog, ObjectPath objectPath) { + Preconditions.checkNotNull(catalog, "Flink catalog cannot be null"); + return TableLoader.fromCatalog(catalog.getCatalogLoader(), catalog.toIdentifier(objectPath)); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java new file mode 100644 index 000000000000..f35bb577fbba --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import org.apache.iceberg.EnvironmentContext; +import org.apache.iceberg.flink.util.FlinkPackage; + +class FlinkEnvironmentContext { + private FlinkEnvironmentContext() {} + + public static void init() { + EnvironmentContext.put(EnvironmentContext.ENGINE_NAME, "flink"); + EnvironmentContext.put(EnvironmentContext.ENGINE_VERSION, FlinkPackage.version()); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java new file mode 100644 index 000000000000..f2244d5137a1 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.BiFunction; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.flink.table.expressions.CallExpression; +import org.apache.flink.table.expressions.FieldReferenceExpression; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.expressions.ValueLiteralExpression; +import org.apache.flink.table.functions.BuiltInFunctionDefinitions; +import org.apache.flink.table.functions.FunctionDefinition; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expression.Operation; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.util.DateTimeUtil; +import org.apache.iceberg.util.NaNUtil; + +public class FlinkFilters { + private FlinkFilters() {} + + private static final Pattern STARTS_WITH_PATTERN = Pattern.compile("([^%]+)%"); + + private static final Map FILTERS = + ImmutableMap.builder() + .put(BuiltInFunctionDefinitions.EQUALS, Operation.EQ) + .put(BuiltInFunctionDefinitions.NOT_EQUALS, Operation.NOT_EQ) + .put(BuiltInFunctionDefinitions.GREATER_THAN, Operation.GT) + .put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, Operation.GT_EQ) + .put(BuiltInFunctionDefinitions.LESS_THAN, Operation.LT) + .put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, Operation.LT_EQ) + .put(BuiltInFunctionDefinitions.IS_NULL, Operation.IS_NULL) + .put(BuiltInFunctionDefinitions.IS_NOT_NULL, Operation.NOT_NULL) + .put(BuiltInFunctionDefinitions.AND, Operation.AND) + .put(BuiltInFunctionDefinitions.OR, Operation.OR) + .put(BuiltInFunctionDefinitions.NOT, Operation.NOT) + .put(BuiltInFunctionDefinitions.LIKE, Operation.STARTS_WITH) + 
+          .buildOrThrow();
+
+  /**
+   * Convert a Flink expression to an Iceberg expression.
+   *
+   * <p>
the BETWEEN, NOT_BETWEEN, IN expression will be converted by flink automatically. the + * BETWEEN will be converted to (GT_EQ AND LT_EQ), the NOT_BETWEEN will be converted to (LT_EQ OR + * GT_EQ), the IN will be converted to OR, so we do not add the conversion here + * + * @param flinkExpression the flink expression + * @return the iceberg expression + */ + public static Optional convert( + org.apache.flink.table.expressions.Expression flinkExpression) { + if (!(flinkExpression instanceof CallExpression)) { + return Optional.empty(); + } + + CallExpression call = (CallExpression) flinkExpression; + Operation op = FILTERS.get(call.getFunctionDefinition()); + if (op != null) { + switch (op) { + case IS_NULL: + return onlyChildAs(call, FieldReferenceExpression.class) + .map(FieldReferenceExpression::getName) + .map(Expressions::isNull); + + case NOT_NULL: + return onlyChildAs(call, FieldReferenceExpression.class) + .map(FieldReferenceExpression::getName) + .map(Expressions::notNull); + + case LT: + return convertFieldAndLiteral(Expressions::lessThan, Expressions::greaterThan, call); + + case LT_EQ: + return convertFieldAndLiteral( + Expressions::lessThanOrEqual, Expressions::greaterThanOrEqual, call); + + case GT: + return convertFieldAndLiteral(Expressions::greaterThan, Expressions::lessThan, call); + + case GT_EQ: + return convertFieldAndLiteral( + Expressions::greaterThanOrEqual, Expressions::lessThanOrEqual, call); + + case EQ: + return convertFieldAndLiteral( + (ref, lit) -> { + if (NaNUtil.isNaN(lit)) { + return Expressions.isNaN(ref); + } else { + return Expressions.equal(ref, lit); + } + }, + call); + + case NOT_EQ: + return convertFieldAndLiteral( + (ref, lit) -> { + if (NaNUtil.isNaN(lit)) { + return Expressions.notNaN(ref); + } else { + return Expressions.notEqual(ref, lit); + } + }, + call); + + case NOT: + return onlyChildAs(call, CallExpression.class) + .flatMap(FlinkFilters::convert) + .map(Expressions::not); + + case AND: + return convertLogicExpression(Expressions::and, call); + + case OR: + return convertLogicExpression(Expressions::or, call); + + case STARTS_WITH: + return convertLike(call); + } + } + + return Optional.empty(); + } + + private static Optional onlyChildAs( + CallExpression call, Class expectedChildClass) { + List children = call.getResolvedChildren(); + if (children.size() != 1) { + return Optional.empty(); + } + + ResolvedExpression child = children.get(0); + if (!expectedChildClass.isInstance(child)) { + return Optional.empty(); + } + + return Optional.of(expectedChildClass.cast(child)); + } + + private static Optional convertLike(CallExpression call) { + List args = call.getResolvedChildren(); + if (args.size() != 2) { + return Optional.empty(); + } + + org.apache.flink.table.expressions.Expression left = args.get(0); + org.apache.flink.table.expressions.Expression right = args.get(1); + + if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { + String name = ((FieldReferenceExpression) left).getName(); + return convertLiteral((ValueLiteralExpression) right) + .flatMap( + lit -> { + if (lit instanceof String) { + String pattern = (String) lit; + Matcher matcher = STARTS_WITH_PATTERN.matcher(pattern); + // exclude special char of LIKE + // '_' is the wildcard of the SQL LIKE + if (!pattern.contains("_") && matcher.matches()) { + return Optional.of(Expressions.startsWith(name, matcher.group(1))); + } + } + + return Optional.empty(); + }); + } + + return Optional.empty(); + } + + private static Optional 
convertLogicExpression( + BiFunction function, CallExpression call) { + List args = call.getResolvedChildren(); + if (args == null || args.size() != 2) { + return Optional.empty(); + } + + Optional left = convert(args.get(0)); + Optional right = convert(args.get(1)); + if (left.isPresent() && right.isPresent()) { + return Optional.of(function.apply(left.get(), right.get())); + } + + return Optional.empty(); + } + + private static Optional convertLiteral(ValueLiteralExpression expression) { + Optional value = + expression.getValueAs( + expression.getOutputDataType().getLogicalType().getDefaultConversion()); + return value.map( + o -> { + if (o instanceof LocalDateTime) { + return DateTimeUtil.microsFromTimestamp((LocalDateTime) o); + } else if (o instanceof Instant) { + return DateTimeUtil.microsFromInstant((Instant) o); + } else if (o instanceof LocalTime) { + return DateTimeUtil.microsFromTime((LocalTime) o); + } else if (o instanceof LocalDate) { + return DateTimeUtil.daysFromDate((LocalDate) o); + } + + return o; + }); + } + + private static Optional convertFieldAndLiteral( + BiFunction expr, CallExpression call) { + return convertFieldAndLiteral(expr, expr, call); + } + + private static Optional convertFieldAndLiteral( + BiFunction convertLR, + BiFunction convertRL, + CallExpression call) { + List args = call.getResolvedChildren(); + if (args.size() != 2) { + return Optional.empty(); + } + + org.apache.flink.table.expressions.Expression left = args.get(0); + org.apache.flink.table.expressions.Expression right = args.get(1); + + if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { + String name = ((FieldReferenceExpression) left).getName(); + Optional lit = convertLiteral((ValueLiteralExpression) right); + if (lit.isPresent()) { + return Optional.of(convertLR.apply(name, lit.get())); + } + } else if (left instanceof ValueLiteralExpression + && right instanceof FieldReferenceExpression) { + Optional lit = convertLiteral((ValueLiteralExpression) left); + String name = ((FieldReferenceExpression) right).getName(); + if (lit.isPresent()) { + return Optional.of(convertRL.apply(name, lit.get())); + } + } + + return Optional.empty(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java new file mode 100644 index 000000000000..767d4497ac91 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.FixupTypes; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; + +/** + * The uuid and fixed are converted to the same Flink type. Conversion back can produce only one, + * which may not be correct. + */ +class FlinkFixupTypes extends FixupTypes { + + private FlinkFixupTypes(Schema referenceSchema) { + super(referenceSchema); + } + + static Schema fixup(Schema schema, Schema referenceSchema) { + return new Schema( + TypeUtil.visit(schema, new FlinkFixupTypes(referenceSchema)).asStructType().fields()); + } + + @Override + protected boolean fixupPrimitive(Type.PrimitiveType type, Type source) { + if (type instanceof Types.FixedType) { + int length = ((Types.FixedType) type).length(); + return source.typeId() == Type.TypeID.UUID && length == 16; + } + return false; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java new file mode 100644 index 000000000000..804a956ec9b9 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.time.Duration; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.util.TimeUtils; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; + +public class FlinkReadConf { + + private final FlinkConfParser confParser; + + public FlinkReadConf( + Table table, Map readOptions, ReadableConfig readableConfig) { + this.confParser = new FlinkConfParser(table, readOptions, readableConfig); + } + + public Long snapshotId() { + return confParser.longConf().option(FlinkReadOptions.SNAPSHOT_ID.key()).parseOptional(); + } + + public String tag() { + return confParser.stringConf().option(FlinkReadOptions.TAG.key()).parseOptional(); + } + + public String startTag() { + return confParser.stringConf().option(FlinkReadOptions.START_TAG.key()).parseOptional(); + } + + public String endTag() { + return confParser.stringConf().option(FlinkReadOptions.END_TAG.key()).parseOptional(); + } + + public String branch() { + return confParser.stringConf().option(FlinkReadOptions.BRANCH.key()).parseOptional(); + } + + public boolean caseSensitive() { + return confParser + .booleanConf() + .option(FlinkReadOptions.CASE_SENSITIVE) + .flinkConfig(FlinkReadOptions.CASE_SENSITIVE_OPTION) + .defaultValue(FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue()) + .parse(); + } + + public Long asOfTimestamp() { + return confParser.longConf().option(FlinkReadOptions.AS_OF_TIMESTAMP.key()).parseOptional(); + } + + public StreamingStartingStrategy startingStrategy() { + return confParser + .enumConfParser(StreamingStartingStrategy.class) + .option(FlinkReadOptions.STARTING_STRATEGY) + .flinkConfig(FlinkReadOptions.STARTING_STRATEGY_OPTION) + .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .parse(); + } + + public Long startSnapshotTimestamp() { + return confParser + .longConf() + .option(FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.key()) + .parseOptional(); + } + + public Long startSnapshotId() { + return confParser.longConf().option(FlinkReadOptions.START_SNAPSHOT_ID.key()).parseOptional(); + } + + public Long endSnapshotId() { + return confParser.longConf().option(FlinkReadOptions.END_SNAPSHOT_ID.key()).parseOptional(); + } + + public long splitSize() { + return confParser + .longConf() + .option(FlinkReadOptions.SPLIT_SIZE) + .flinkConfig(FlinkReadOptions.SPLIT_SIZE_OPTION) + .tableProperty(TableProperties.SPLIT_SIZE) + .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT) + .parse(); + } + + public int splitLookback() { + return confParser + .intConf() + .option(FlinkReadOptions.SPLIT_LOOKBACK) + .flinkConfig(FlinkReadOptions.SPLIT_LOOKBACK_OPTION) + .tableProperty(TableProperties.SPLIT_LOOKBACK) + .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT) + .parse(); + } + + public long splitFileOpenCost() { + return confParser + .longConf() + .option(FlinkReadOptions.SPLIT_FILE_OPEN_COST) + .flinkConfig(FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION) + .tableProperty(TableProperties.SPLIT_OPEN_FILE_COST) + .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT) + .parse(); + } + + public boolean streaming() { + return confParser + .booleanConf() + .option(FlinkReadOptions.STREAMING) + .flinkConfig(FlinkReadOptions.STREAMING_OPTION) + .defaultValue(FlinkReadOptions.STREAMING_OPTION.defaultValue()) + .parse(); + } + + public Duration monitorInterval() { + String duration = + 
confParser + .stringConf() + .option(FlinkReadOptions.MONITOR_INTERVAL) + .flinkConfig(FlinkReadOptions.MONITOR_INTERVAL_OPTION) + .defaultValue(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue()) + .parse(); + + return TimeUtils.parseDuration(duration); + } + + public boolean includeColumnStats() { + return confParser + .booleanConf() + .option(FlinkReadOptions.INCLUDE_COLUMN_STATS) + .flinkConfig(FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION) + .defaultValue(FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue()) + .parse(); + } + + public int maxPlanningSnapshotCount() { + return confParser + .intConf() + .option(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT) + .flinkConfig(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION) + .defaultValue(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION.defaultValue()) + .parse(); + } + + public String nameMapping() { + return confParser.stringConf().option(TableProperties.DEFAULT_NAME_MAPPING).parseOptional(); + } + + public long limit() { + return confParser + .longConf() + .option(FlinkReadOptions.LIMIT) + .flinkConfig(FlinkReadOptions.LIMIT_OPTION) + .defaultValue(FlinkReadOptions.LIMIT_OPTION.defaultValue()) + .parse(); + } + + public int workerPoolSize() { + return confParser + .intConf() + .option(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.key()) + .flinkConfig(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE) + .defaultValue(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue()) + .parse(); + } + + public int maxAllowedPlanningFailures() { + return confParser + .intConf() + .option(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES) + .flinkConfig(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION) + .defaultValue(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue()) + .parse(); + } + + public String watermarkColumn() { + return confParser + .stringConf() + .option(FlinkReadOptions.WATERMARK_COLUMN) + .flinkConfig(FlinkReadOptions.WATERMARK_COLUMN_OPTION) + .defaultValue(FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue()) + .parseOptional(); + } + + public TimeUnit watermarkColumnTimeUnit() { + return confParser + .enumConfParser(TimeUnit.class) + .option(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT) + .flinkConfig(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION) + .defaultValue(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue()) + .parse(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java new file mode 100644 index 000000000000..1bbd88146c8f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.util.concurrent.TimeUnit; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; + +/** Flink source read options */ +public class FlinkReadOptions { + private static final String PREFIX = "connector.iceberg."; + + private FlinkReadOptions() {} + + public static final ConfigOption SNAPSHOT_ID = + ConfigOptions.key("snapshot-id").longType().defaultValue(null); + + public static final ConfigOption TAG = + ConfigOptions.key("tag").stringType().defaultValue(null); + + public static final ConfigOption BRANCH = + ConfigOptions.key("branch").stringType().defaultValue(null); + + public static final ConfigOption START_TAG = + ConfigOptions.key("start-tag").stringType().defaultValue(null); + + public static final ConfigOption END_TAG = + ConfigOptions.key("end-tag").stringType().defaultValue(null); + + public static final String CASE_SENSITIVE = "case-sensitive"; + public static final ConfigOption CASE_SENSITIVE_OPTION = + ConfigOptions.key(PREFIX + CASE_SENSITIVE).booleanType().defaultValue(false); + + public static final ConfigOption AS_OF_TIMESTAMP = + ConfigOptions.key("as-of-timestamp").longType().defaultValue(null); + + public static final String STARTING_STRATEGY = "starting-strategy"; + public static final ConfigOption STARTING_STRATEGY_OPTION = + ConfigOptions.key(PREFIX + STARTING_STRATEGY) + .enumType(StreamingStartingStrategy.class) + .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT); + + public static final ConfigOption START_SNAPSHOT_TIMESTAMP = + ConfigOptions.key("start-snapshot-timestamp").longType().defaultValue(null); + + public static final ConfigOption START_SNAPSHOT_ID = + ConfigOptions.key("start-snapshot-id").longType().defaultValue(null); + + public static final ConfigOption END_SNAPSHOT_ID = + ConfigOptions.key("end-snapshot-id").longType().defaultValue(null); + + public static final String SPLIT_SIZE = "split-size"; + public static final ConfigOption SPLIT_SIZE_OPTION = + ConfigOptions.key(PREFIX + SPLIT_SIZE) + .longType() + .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT); + + public static final String SPLIT_LOOKBACK = "split-lookback"; + public static final ConfigOption SPLIT_LOOKBACK_OPTION = + ConfigOptions.key(PREFIX + SPLIT_LOOKBACK) + .intType() + .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT); + + public static final String SPLIT_FILE_OPEN_COST = "split-file-open-cost"; + public static final ConfigOption SPLIT_FILE_OPEN_COST_OPTION = + ConfigOptions.key(PREFIX + SPLIT_FILE_OPEN_COST) + .longType() + .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); + + public static final String STREAMING = "streaming"; + public static final ConfigOption STREAMING_OPTION = + ConfigOptions.key(PREFIX + STREAMING).booleanType().defaultValue(false); + + public static final String MONITOR_INTERVAL = "monitor-interval"; + public static final ConfigOption MONITOR_INTERVAL_OPTION = + ConfigOptions.key(PREFIX + MONITOR_INTERVAL).stringType().defaultValue("60s"); + + public static final String INCLUDE_COLUMN_STATS = "include-column-stats"; + public static final ConfigOption INCLUDE_COLUMN_STATS_OPTION = + ConfigOptions.key(PREFIX + INCLUDE_COLUMN_STATS).booleanType().defaultValue(false); + + public static final String 
MAX_PLANNING_SNAPSHOT_COUNT = "max-planning-snapshot-count"; + public static final ConfigOption MAX_PLANNING_SNAPSHOT_COUNT_OPTION = + ConfigOptions.key(PREFIX + MAX_PLANNING_SNAPSHOT_COUNT) + .intType() + .defaultValue(Integer.MAX_VALUE); + + public static final String LIMIT = "limit"; + public static final ConfigOption LIMIT_OPTION = + ConfigOptions.key(PREFIX + LIMIT).longType().defaultValue(-1L); + + public static final String MAX_ALLOWED_PLANNING_FAILURES = "max-allowed-planning-failures"; + public static final ConfigOption MAX_ALLOWED_PLANNING_FAILURES_OPTION = + ConfigOptions.key(PREFIX + MAX_ALLOWED_PLANNING_FAILURES).intType().defaultValue(3); + + public static final String WATERMARK_COLUMN = "watermark-column"; + public static final ConfigOption WATERMARK_COLUMN_OPTION = + ConfigOptions.key(PREFIX + WATERMARK_COLUMN).stringType().noDefaultValue(); + + public static final String WATERMARK_COLUMN_TIME_UNIT = "watermark-column-time-unit"; + public static final ConfigOption WATERMARK_COLUMN_TIME_UNIT_OPTION = + ConfigOptions.key(PREFIX + WATERMARK_COLUMN_TIME_UNIT) + .enumType(TimeUnit.class) + .defaultValue(TimeUnit.MICROSECONDS); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java new file mode 100644 index 000000000000..06c1635312b9 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.NullType; + +public class FlinkRowData { + + private FlinkRowData() {} + + public static RowData.FieldGetter createFieldGetter(LogicalType fieldType, int fieldPos) { + if (fieldType instanceof NullType) { + return rowData -> null; + } + + RowData.FieldGetter flinkFieldGetter = RowData.createFieldGetter(fieldType, fieldPos); + return rowData -> { + // Be sure to check for null values, even if the field is required. Flink + // RowData.createFieldGetter(..) does not null-check optional / nullable types. Without this + // explicit null check, the null flag of BinaryRowData will be ignored and random bytes will + // be parsed as actual values. This will produce incorrect writes instead of failing with a + // NullPointerException. 
See https://issues.apache.org/jira/browse/FLINK-37245 + if (!fieldType.isNullable() && rowData.isNullAt(fieldPos)) { + return null; + } + return flinkFieldGetter.getFieldOrNull(rowData); + }; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java new file mode 100644 index 000000000000..7f55d4b07bb1 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java @@ -0,0 +1,380 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.function.Function; +import java.util.stream.Collectors; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.ValidationException; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.catalog.UniqueConstraint; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; + +/** + * Converter between Flink types and Iceberg type. The conversion is not a 1:1 mapping that not + * allows back-and-forth conversion. So some information might get lost during the back-and-forth + * conversion. + * + *

The following type mappings are inconsistent; a short illustrative sketch follows the list: + * + *

    + *
  • map Iceberg UUID type to Flink BinaryType(16) + *
  • map Flink VarCharType(_) and CharType(_) to Iceberg String type + *
  • map Flink VarBinaryType(_) to Iceberg Binary type + *
  • map Flink TimeType(_) to Iceberg Time type (microseconds) + *
  • map Flink TimestampType(_) to Iceberg Timestamp without zone type (microseconds) + *
  • map Flink LocalZonedTimestampType(_) to Iceberg Timestamp with zone type (microseconds) + *
  • map Flink MultiSetType to Iceberg Map type(element, int) + *
+ * + *
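To make the lossiness above concrete, here is a small editorial sketch (not part of this patch; the class name TypeRoundTripSketch is hypothetical, and the exact reverse mapping comes from TypeToFlinkType): a Flink CHAR(3) column converts to Iceberg's string type, and converting back yields a maximum-length VARCHAR, so the original length constraint is lost.

    import org.apache.flink.table.types.logical.CharType;
    import org.apache.flink.table.types.logical.LogicalType;
    import org.apache.iceberg.flink.FlinkSchemaUtil;
    import org.apache.iceberg.types.Type;

    public class TypeRoundTripSketch {
      public static void main(String[] args) {
        // CHAR(3) is mapped to Iceberg's string type, dropping the length constraint ...
        Type icebergType = FlinkSchemaUtil.convert(new CharType(3));
        // ... and the reverse conversion produces a maximum-length VARCHAR, not CHAR(3).
        LogicalType roundTripped = FlinkSchemaUtil.convert(icebergType);
        System.out.println(icebergType + " -> " + roundTripped);
      }
    }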

+ */ +public class FlinkSchemaUtil { + + private FlinkSchemaUtil() {} + + /** + * @deprecated will be removed in 2.0.0; use {@link #convert(ResolvedSchema)} instead. + */ + @Deprecated + public static Schema convert(TableSchema schema) { + LogicalType schemaType = schema.toRowDataType().getLogicalType(); + Preconditions.checkArgument( + schemaType instanceof RowType, "Schema logical type should be row type."); + + RowType root = (RowType) schemaType; + Type converted = root.accept(new FlinkTypeToType(root)); + + Schema icebergSchema = new Schema(converted.asStructType().fields()); + if (schema.getPrimaryKey().isPresent()) { + return freshIdentifierFieldIds(icebergSchema, schema.getPrimaryKey().get().getColumns()); + } else { + return icebergSchema; + } + } + + /** Convert the flink table schema to apache iceberg schema with column comment. */ + public static Schema convert(ResolvedSchema flinkSchema) { + List tableColumns = flinkSchema.getColumns(); + // copy from org.apache.flink.table.api.Schema#toRowDataType + DataTypes.Field[] fields = + tableColumns.stream() + .map( + column -> { + if (column.getComment().isPresent()) { + return DataTypes.FIELD( + column.getName(), column.getDataType(), column.getComment().get()); + } else { + return DataTypes.FIELD(column.getName(), column.getDataType()); + } + }) + .toArray(DataTypes.Field[]::new); + + LogicalType schemaType = DataTypes.ROW(fields).notNull().getLogicalType(); + Preconditions.checkArgument( + schemaType instanceof RowType, "Schema logical type should be row type."); + + RowType root = (RowType) schemaType; + Type converted = root.accept(new FlinkTypeToType(root)); + Schema icebergSchema = new Schema(converted.asStructType().fields()); + return flinkSchema + .getPrimaryKey() + .map(pk -> freshIdentifierFieldIds(icebergSchema, pk.getColumns())) + .orElse(icebergSchema); + } + + private static Schema freshIdentifierFieldIds(Schema icebergSchema, List primaryKeys) { + // Locate the identifier field id list. + Set identifierFieldIds = Sets.newHashSet(); + for (String primaryKey : primaryKeys) { + Types.NestedField field = icebergSchema.findField(primaryKey); + Preconditions.checkNotNull( + field, + "Cannot find field ID for the primary key column %s in schema %s", + primaryKey, + icebergSchema); + identifierFieldIds.add(field.fieldId()); + } + return new Schema( + icebergSchema.schemaId(), icebergSchema.asStruct().fields(), identifierFieldIds); + } + + /** + * Convert a Flink {@link TableSchema} to a {@link Schema} based on the given schema. + * + *

This conversion does not assign new ids; it uses ids from the base schema. + * + *

Data types, field order, and nullability will match the Flink type. This conversion may + * return a schema that is not compatible with base schema. + * + * @param baseSchema a Schema on which conversion is based + * @param flinkSchema a Flink TableSchema + * @return the equivalent Schema + * @throws IllegalArgumentException if the type cannot be converted or there are missing ids + * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #convert(Schema, + * ResolvedSchema)} instead. + */ + @Deprecated + public static Schema convert(Schema baseSchema, TableSchema flinkSchema) { + // convert to a type with fresh ids + Types.StructType struct = convert(flinkSchema).asStruct(); + // reassign ids to match the base schema + Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); + // reassign doc to match the base schema + schema = TypeUtil.reassignDoc(schema, baseSchema); + + // fix types that can't be represented in Flink (UUID) + Schema fixedSchema = FlinkFixupTypes.fixup(schema, baseSchema); + if (flinkSchema.getPrimaryKey().isPresent()) { + return freshIdentifierFieldIds(fixedSchema, flinkSchema.getPrimaryKey().get().getColumns()); + } else { + return fixedSchema; + } + } + + /** + * Convert a Flink {@link ResolvedSchema} to a {@link Schema} based on the given schema. + * + *

This conversion does not assign new ids; it uses ids from the base schema. + * + *
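As a hedged usage sketch (not part of this patch; SchemaConversionSketch and toIcebergSchema are hypothetical names), the base schema is typically the schema of an existing Iceberg table, so the converted schema keeps that table's field ids while the types follow the Flink schema:

    import org.apache.flink.table.catalog.ResolvedSchema;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.flink.FlinkSchemaUtil;

    public class SchemaConversionSketch {
      // 'table' and 'resolvedSchema' are assumed to be supplied by the caller.
      static Schema toIcebergSchema(Table table, ResolvedSchema resolvedSchema) {
        // Field ids and docs come from the table's current schema; types, field order,
        // and nullability follow the Flink schema.
        return FlinkSchemaUtil.convert(table.schema(), resolvedSchema);
      }
    }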

Data types, field order, and nullability will match the Flink type. This conversion may + * return a schema that is not compatible with base schema. + * + * @param baseSchema a Schema on which conversion is based + * @param flinkSchema a Flink ResolvedSchema + * @return the equivalent Schema + * @throws IllegalArgumentException if the type cannot be converted or there are missing ids + */ + public static Schema convert(Schema baseSchema, ResolvedSchema flinkSchema) { + // convert to a type with fresh ids + Types.StructType struct = convert(flinkSchema).asStruct(); + // reassign ids to match the base schema + Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); + // reassign doc to match the base schema + schema = TypeUtil.reassignDoc(schema, baseSchema); + + // fix types that can't be represented in Flink (UUID) + Schema fixedSchema = FlinkFixupTypes.fixup(schema, baseSchema); + return flinkSchema + .getPrimaryKey() + .map(pk -> freshIdentifierFieldIds(fixedSchema, pk.getColumns())) + .orElse(fixedSchema); + } + + /** + * Convert a {@link Schema} to a {@link RowType Flink type}. + * + * @param schema a Schema + * @return the equivalent Flink type + * @throws IllegalArgumentException if the type cannot be converted to Flink + */ + public static RowType convert(Schema schema) { + return (RowType) TypeUtil.visit(schema, new TypeToFlinkType()); + } + + /** + * Convert a {@link Type} to a {@link LogicalType Flink type}. + * + * @param type a Type + * @return the equivalent Flink type + * @throws IllegalArgumentException if the type cannot be converted to Flink + */ + public static LogicalType convert(Type type) { + return TypeUtil.visit(type, new TypeToFlinkType()); + } + + /** + * Convert a {@link LogicalType Flink type} to a {@link Type}. + * + * @param flinkType a FlinkType + * @return the equivalent Iceberg type + */ + public static Type convert(LogicalType flinkType) { + return flinkType.accept(new FlinkTypeToType()); + } + + /** + * Convert a {@link RowType} to a {@link TableSchema}. + * + * @param rowType a RowType + * @return Flink TableSchema + * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #toResolvedSchema(RowType)} + * instead + */ + @Deprecated + public static TableSchema toSchema(RowType rowType) { + TableSchema.Builder builder = TableSchema.builder(); + for (RowType.RowField field : rowType.getFields()) { + builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); + } + return builder.build(); + } + + /** + * Convert a {@link RowType} to a {@link ResolvedSchema}. + * + * @param rowType a RowType + * @return Flink ResolvedSchema + */ + public static ResolvedSchema toResolvedSchema(RowType rowType) { + List columns = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); + for (RowType.RowField field : rowType.getFields()) { + columns.add( + Column.physical(field.getName(), TypeConversions.fromLogicalToDataType(field.getType()))); + } + + return ResolvedSchema.of(columns); + } + + /** + * Convert a {@link Schema} to a {@link TableSchema}. + * + * @param schema iceberg schema to convert. + * @return Flink TableSchema. + * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #toResolvedSchema(Schema)} + * instead + */ + @Deprecated + public static TableSchema toSchema(Schema schema) { + TableSchema.Builder builder = TableSchema.builder(); + + // Add columns. 
+ for (RowType.RowField field : convert(schema).getFields()) { + builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); + } + + // Add primary key. + Set identifierFieldIds = schema.identifierFieldIds(); + if (!identifierFieldIds.isEmpty()) { + List columns = Lists.newArrayListWithExpectedSize(identifierFieldIds.size()); + for (Integer identifierFieldId : identifierFieldIds) { + String columnName = schema.findColumnName(identifierFieldId); + Preconditions.checkNotNull( + columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); + + columns.add(columnName); + } + builder.primaryKey(columns.toArray(new String[0])); + } + + return builder.build(); + } + + /** + * Convert a {@link Schema} to a {@link ResolvedSchema}. + * + * @param schema iceberg schema to convert. + * @return Flink ResolvedSchema. + */ + public static ResolvedSchema toResolvedSchema(Schema schema) { + RowType rowType = convert(schema); + List columns = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); + + // Add columns. + for (RowType.RowField field : rowType.getFields()) { + columns.add( + Column.physical(field.getName(), TypeConversions.fromLogicalToDataType(field.getType()))); + } + + // Add primary key. + Set identifierFieldIds = schema.identifierFieldIds(); + UniqueConstraint uniqueConstraint = null; + if (!identifierFieldIds.isEmpty()) { + List primaryKeyColumns = + Lists.newArrayListWithExpectedSize(identifierFieldIds.size()); + for (Integer identifierFieldId : identifierFieldIds) { + String columnName = schema.findColumnName(identifierFieldId); + Preconditions.checkNotNull( + columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); + + primaryKeyColumns.add(columnName); + } + + uniqueConstraint = + UniqueConstraint.primaryKey(UUID.randomUUID().toString(), primaryKeyColumns); + + validatePrimaryKey(uniqueConstraint, columns); + } + + return new ResolvedSchema(columns, Collections.emptyList(), uniqueConstraint); + } + + /** + * Copied from + * org.apache.flink.table.catalog.DefaultSchemaResolver#validatePrimaryKey(org.apache.flink.table.catalog.UniqueConstraint, + * java.util.List) + */ + private static void validatePrimaryKey(UniqueConstraint primaryKey, List columns) { + final Map columnsByNameLookup = + columns.stream().collect(Collectors.toMap(Column::getName, Function.identity())); + + final Set duplicateColumns = + primaryKey.getColumns().stream() + .filter(name -> Collections.frequency(primaryKey.getColumns(), name) > 1) + .collect(Collectors.toSet()); + + if (!duplicateColumns.isEmpty()) { + throw new ValidationException( + String.format( + "Invalid primary key '%s'. A primary key must not contain duplicate columns. Found: %s", + primaryKey.getName(), duplicateColumns)); + } + + for (String columnName : primaryKey.getColumns()) { + Column column = columnsByNameLookup.get(columnName); + if (column == null) { + throw new ValidationException( + String.format( + "Invalid primary key '%s'. Column '%s' does not exist.", + primaryKey.getName(), columnName)); + } + + if (!column.isPhysical()) { + throw new ValidationException( + String.format( + "Invalid primary key '%s'. Column '%s' is not a physical column.", + primaryKey.getName(), columnName)); + } + + final LogicalType columnType = column.getDataType().getLogicalType(); + if (columnType.isNullable()) { + throw new ValidationException( + String.format( + "Invalid primary key '%s'. 
Column '%s' is nullable.", + primaryKey.getName(), columnName)); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java new file mode 100644 index 000000000000..5fbd84909d69 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.apache.flink.api.common.functions.FilterFunction; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Evaluator; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.types.Types; + +public class FlinkSourceFilter implements FilterFunction { + + private final RowType rowType; + private final Evaluator evaluator; + private final Types.StructType struct; + private volatile RowDataWrapper wrapper; + + public FlinkSourceFilter(Schema schema, Expression expr, boolean caseSensitive) { + this.rowType = FlinkSchemaUtil.convert(schema); + this.struct = schema.asStruct(); + this.evaluator = new Evaluator(struct, expr, caseSensitive); + } + + @Override + public boolean filter(RowData value) { + if (wrapper == null) { + this.wrapper = new RowDataWrapper(rowType, struct); + } + return evaluator.eval(wrapper.wrap(value)); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java new file mode 100644 index 000000000000..408065f06057 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.BigIntType; +import org.apache.flink.table.types.logical.BinaryType; +import org.apache.flink.table.types.logical.BooleanType; +import org.apache.flink.table.types.logical.CharType; +import org.apache.flink.table.types.logical.DateType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.DoubleType; +import org.apache.flink.table.types.logical.FloatType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.MultisetType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.SmallIntType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.TinyIntType; +import org.apache.flink.table.types.logical.VarBinaryType; +import org.apache.flink.table.types.logical.VarCharType; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +class FlinkTypeToType extends FlinkTypeVisitor { + + private final RowType root; + private int nextId; + + FlinkTypeToType() { + this.root = null; + } + + FlinkTypeToType(RowType root) { + this.root = root; + // the root struct's fields use the first ids + this.nextId = root.getFieldCount(); + } + + private int getNextId() { + int next = nextId; + nextId += 1; + return next; + } + + @Override + public Type visit(CharType charType) { + return Types.StringType.get(); + } + + @Override + public Type visit(VarCharType varCharType) { + return Types.StringType.get(); + } + + @Override + public Type visit(BooleanType booleanType) { + return Types.BooleanType.get(); + } + + @Override + public Type visit(BinaryType binaryType) { + return Types.FixedType.ofLength(binaryType.getLength()); + } + + @Override + public Type visit(VarBinaryType varBinaryType) { + return Types.BinaryType.get(); + } + + @Override + public Type visit(DecimalType decimalType) { + return Types.DecimalType.of(decimalType.getPrecision(), decimalType.getScale()); + } + + @Override + public Type visit(TinyIntType tinyIntType) { + return Types.IntegerType.get(); + } + + @Override + public Type visit(SmallIntType smallIntType) { + return Types.IntegerType.get(); + } + + @Override + public Type visit(IntType intType) { + return Types.IntegerType.get(); + } + + @Override + public Type visit(BigIntType bigIntType) { + return Types.LongType.get(); + } + + @Override + public Type visit(FloatType floatType) { + return Types.FloatType.get(); + } + + @Override + public Type visit(DoubleType doubleType) { + return Types.DoubleType.get(); + } + + @Override + public Type visit(DateType dateType) { + return Types.DateType.get(); + } + + @Override + public Type visit(TimeType timeType) { + return Types.TimeType.get(); + } + + @Override + public Type visit(TimestampType timestampType) { + return Types.TimestampType.withoutZone(); + } + + @Override + public Type visit(LocalZonedTimestampType localZonedTimestampType) { + return Types.TimestampType.withZone(); + } + + @Override + public Type visit(ArrayType arrayType) { + Type elementType = 
arrayType.getElementType().accept(this); + if (arrayType.getElementType().isNullable()) { + return Types.ListType.ofOptional(getNextId(), elementType); + } else { + return Types.ListType.ofRequired(getNextId(), elementType); + } + } + + @Override + public Type visit(MultisetType multisetType) { + Type elementType = multisetType.getElementType().accept(this); + return Types.MapType.ofRequired(getNextId(), getNextId(), elementType, Types.IntegerType.get()); + } + + @Override + public Type visit(MapType mapType) { + // keys in map are not allowed to be null. + Type keyType = mapType.getKeyType().accept(this); + Type valueType = mapType.getValueType().accept(this); + if (mapType.getValueType().isNullable()) { + return Types.MapType.ofOptional(getNextId(), getNextId(), keyType, valueType); + } else { + return Types.MapType.ofRequired(getNextId(), getNextId(), keyType, valueType); + } + } + + @Override + @SuppressWarnings("ReferenceEquality") + public Type visit(RowType rowType) { + List newFields = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); + boolean isRoot = root == rowType; + + List types = + rowType.getFields().stream() + .map(f -> f.getType().accept(this)) + .collect(Collectors.toList()); + + for (int i = 0; i < rowType.getFieldCount(); i++) { + int id = isRoot ? i : getNextId(); + + RowType.RowField field = rowType.getFields().get(i); + String name = field.getName(); + String comment = field.getDescription().orElse(null); + + if (field.getType().isNullable()) { + newFields.add(Types.NestedField.optional(id, name, types.get(i), comment)); + } else { + newFields.add(Types.NestedField.required(id, name, types.get(i), comment)); + } + } + + return Types.StructType.of(newFields); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java new file mode 100644 index 000000000000..f3de2416088c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import org.apache.flink.table.types.logical.DayTimeIntervalType; +import org.apache.flink.table.types.logical.DistinctType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeVisitor; +import org.apache.flink.table.types.logical.NullType; +import org.apache.flink.table.types.logical.RawType; +import org.apache.flink.table.types.logical.StructuredType; +import org.apache.flink.table.types.logical.SymbolType; +import org.apache.flink.table.types.logical.YearMonthIntervalType; +import org.apache.flink.table.types.logical.ZonedTimestampType; + +public abstract class FlinkTypeVisitor implements LogicalTypeVisitor { + + // ------------------------- Unsupported types ------------------------------ + + @Override + public T visit(ZonedTimestampType zonedTimestampType) { + throw new UnsupportedOperationException("Unsupported ZonedTimestampType."); + } + + @Override + public T visit(YearMonthIntervalType yearMonthIntervalType) { + throw new UnsupportedOperationException("Unsupported YearMonthIntervalType."); + } + + @Override + public T visit(DayTimeIntervalType dayTimeIntervalType) { + throw new UnsupportedOperationException("Unsupported DayTimeIntervalType."); + } + + @Override + public T visit(DistinctType distinctType) { + throw new UnsupportedOperationException("Unsupported DistinctType."); + } + + @Override + public T visit(StructuredType structuredType) { + throw new UnsupportedOperationException("Unsupported StructuredType."); + } + + @Override + public T visit(NullType nullType) { + throw new UnsupportedOperationException("Unsupported NullType."); + } + + @Override + public T visit(RawType rawType) { + throw new UnsupportedOperationException("Unsupported RawType."); + } + + @Override + public T visit(SymbolType symbolType) { + throw new UnsupportedOperationException("Unsupported SymbolType."); + } + + @Override + public T visit(LogicalType other) { + throw new UnsupportedOperationException("Unsupported type: " + other); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java new file mode 100644 index 000000000000..222a1e810468 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.time.Duration; +import java.util.Map; +import org.apache.flink.annotation.Experimental; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.sink.shuffle.StatisticsType; + +/** + * A class for common Iceberg configs for Flink writes. + * + *

If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * + *

    + *
  1. Write options + *
  2. Flink ReadableConfig + *
  3. Table metadata + *
+ * + * The most specific value is set in the write options and takes precedence over all other configs. If + * no write option is provided, this class checks the Flink configuration for any overrides. If no + * applicable value is found in the Flink configuration, this class falls back to the table metadata. A short + * sketch of this precedence follows. + * + *
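The sketch below is editorial and not part of this patch (WriteConfPrecedenceSketch is a hypothetical name); it shows the upsert flag being resolved from a write option even though the Flink configuration and the table property disagree.

    import java.util.Map;
    import org.apache.flink.configuration.Configuration;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.flink.FlinkWriteConf;
    import org.apache.iceberg.flink.FlinkWriteOptions;

    public class WriteConfPrecedenceSketch {
      static boolean resolveUpsert(Table table) {
        Configuration flinkConf = new Configuration();
        flinkConf.set(FlinkWriteOptions.WRITE_UPSERT_ENABLED, false); // level 2: Flink config
        Map<String, String> writeOptions =
            Map.of(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key(), "true"); // level 1: write option
        // Even if the table sets write.upsert.enabled=false (level 3), the write option wins.
        return new FlinkWriteConf(table, writeOptions, flinkConf).upsertMode();
      }
    }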

Note this class is NOT meant to be serialized. + */ +public class FlinkWriteConf { + + private final FlinkConfParser confParser; + + public FlinkWriteConf( + Table table, Map writeOptions, ReadableConfig readableConfig) { + this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); + } + + public FlinkWriteConf(Map writeOptions, ReadableConfig readableConfig) { + this.confParser = new FlinkConfParser(writeOptions, readableConfig); + } + + public boolean overwriteMode() { + return confParser + .booleanConf() + .option(FlinkWriteOptions.OVERWRITE_MODE.key()) + .flinkConfig(FlinkWriteOptions.OVERWRITE_MODE) + .defaultValue(FlinkWriteOptions.OVERWRITE_MODE.defaultValue()) + .parse(); + } + + public boolean upsertMode() { + return confParser + .booleanConf() + .option(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key()) + .flinkConfig(FlinkWriteOptions.WRITE_UPSERT_ENABLED) + .tableProperty(TableProperties.UPSERT_ENABLED) + .defaultValue(TableProperties.UPSERT_ENABLED_DEFAULT) + .parse(); + } + + public FileFormat dataFileFormat() { + String valueAsString = + confParser + .stringConf() + .option(FlinkWriteOptions.WRITE_FORMAT.key()) + .flinkConfig(FlinkWriteOptions.WRITE_FORMAT) + .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) + .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) + .parse(); + return FileFormat.fromString(valueAsString); + } + + public long targetDataFileSize() { + return confParser + .longConf() + .option(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES.key()) + .flinkConfig(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES) + .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) + .defaultValue(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT) + .parse(); + } + + public String parquetCompressionCodec() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) + .tableProperty(TableProperties.PARQUET_COMPRESSION) + .defaultValue(TableProperties.PARQUET_COMPRESSION_DEFAULT) + .parse(); + } + + public String parquetCompressionLevel() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_LEVEL.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_LEVEL) + .tableProperty(TableProperties.PARQUET_COMPRESSION_LEVEL) + .defaultValue(TableProperties.PARQUET_COMPRESSION_LEVEL_DEFAULT) + .parseOptional(); + } + + public String avroCompressionCodec() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) + .tableProperty(TableProperties.AVRO_COMPRESSION) + .defaultValue(TableProperties.AVRO_COMPRESSION_DEFAULT) + .parse(); + } + + public String avroCompressionLevel() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_LEVEL.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_LEVEL) + .tableProperty(TableProperties.AVRO_COMPRESSION_LEVEL) + .defaultValue(TableProperties.AVRO_COMPRESSION_LEVEL_DEFAULT) + .parseOptional(); + } + + public String orcCompressionCodec() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) + .tableProperty(TableProperties.ORC_COMPRESSION) + .defaultValue(TableProperties.ORC_COMPRESSION_DEFAULT) + .parse(); + } + + public String orcCompressionStrategy() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_STRATEGY.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_STRATEGY) + 
.tableProperty(TableProperties.ORC_COMPRESSION_STRATEGY) + .defaultValue(TableProperties.ORC_COMPRESSION_STRATEGY_DEFAULT) + .parse(); + } + + public DistributionMode distributionMode() { + String modeName = + confParser + .stringConf() + .option(FlinkWriteOptions.DISTRIBUTION_MODE.key()) + .flinkConfig(FlinkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) + .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_NONE) + .parse(); + return DistributionMode.fromName(modeName); + } + + public StatisticsType rangeDistributionStatisticsType() { + String name = + confParser + .stringConf() + .option(FlinkWriteOptions.RANGE_DISTRIBUTION_STATISTICS_TYPE.key()) + .flinkConfig(FlinkWriteOptions.RANGE_DISTRIBUTION_STATISTICS_TYPE) + .defaultValue(FlinkWriteOptions.RANGE_DISTRIBUTION_STATISTICS_TYPE.defaultValue()) + .parse(); + return StatisticsType.valueOf(name); + } + + public double rangeDistributionSortKeyBaseWeight() { + return confParser + .doubleConf() + .option(FlinkWriteOptions.RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT.key()) + .flinkConfig(FlinkWriteOptions.RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT) + .defaultValue(FlinkWriteOptions.RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT.defaultValue()) + .parse(); + } + + public int workerPoolSize() { + return confParser + .intConf() + .flinkConfig(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE) + .defaultValue(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue()) + .parse(); + } + + public String branch() { + return confParser + .stringConf() + .option(FlinkWriteOptions.BRANCH.key()) + .defaultValue(FlinkWriteOptions.BRANCH.defaultValue()) + .parse(); + } + + public Integer writeParallelism() { + return confParser.intConf().option(FlinkWriteOptions.WRITE_PARALLELISM.key()).parseOptional(); + } + + public boolean compactMode() { + return confParser + .booleanConf() + .option(FlinkWriteOptions.COMPACTION_ENABLE.key()) + .flinkConfig(FlinkWriteOptions.COMPACTION_ENABLE) + .defaultValue(FlinkWriteOptions.COMPACTION_ENABLE.defaultValue()) + .parse(); + } + + /** + * NOTE: This may be removed or changed in a future release. This value specifies the interval for + * refreshing the table instances in sink writer subtasks. If not specified then the default + * behavior is to not refresh the table. + * + * @return the interval for refreshing the table in sink writer subtasks + */ + @Experimental + public Duration tableRefreshInterval() { + return confParser + .durationConf() + .option(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key()) + .flinkConfig(FlinkWriteOptions.TABLE_REFRESH_INTERVAL) + .parseOptional(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java new file mode 100644 index 000000000000..6bdb01c3f5d3 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.time.Duration; +import org.apache.flink.annotation.Experimental; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.flink.sink.shuffle.StatisticsType; + +/** Flink sink write options */ +public class FlinkWriteOptions { + + private FlinkWriteOptions() {} + + // File format for write operations(default: Table write.format.default ) + public static final ConfigOption WRITE_FORMAT = + ConfigOptions.key("write-format").stringType().noDefaultValue(); + + // Overrides this table's write.target-file-size-bytes + public static final ConfigOption TARGET_FILE_SIZE_BYTES = + ConfigOptions.key("target-file-size-bytes").longType().noDefaultValue(); + + // Overrides this table's write..compression-codec + public static final ConfigOption COMPRESSION_CODEC = + ConfigOptions.key("compression-codec").stringType().noDefaultValue(); + + // Overrides this table's write..compression-level + public static final ConfigOption COMPRESSION_LEVEL = + ConfigOptions.key("compression-level").stringType().noDefaultValue(); + + // Overrides this table's write..compression-strategy + public static final ConfigOption COMPRESSION_STRATEGY = + ConfigOptions.key("compression-strategy").stringType().noDefaultValue(); + + // Overrides this table's write.upsert.enabled + public static final ConfigOption WRITE_UPSERT_ENABLED = + ConfigOptions.key("upsert-enabled").booleanType().noDefaultValue(); + + public static final ConfigOption OVERWRITE_MODE = + ConfigOptions.key("overwrite-enabled").booleanType().defaultValue(false); + + // Overrides the table's write.distribution-mode + public static final ConfigOption DISTRIBUTION_MODE = + ConfigOptions.key("distribution-mode").stringType().noDefaultValue(); + + public static final ConfigOption RANGE_DISTRIBUTION_STATISTICS_TYPE = + ConfigOptions.key("range-distribution-statistics-type") + .stringType() + .defaultValue(StatisticsType.Auto.name()) + .withDescription("Type of statistics collection: Auto, Map, Sketch"); + + public static final ConfigOption RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT = + ConfigOptions.key("range-distribution-sort-key-base-weight") + .doubleType() + .defaultValue(0.0d) + .withDescription( + "Base weight for every sort key relative to target weight per writer task"); + + // Branch to write to + public static final ConfigOption BRANCH = + ConfigOptions.key("branch").stringType().defaultValue(SnapshotRef.MAIN_BRANCH); + + public static final ConfigOption WRITE_PARALLELISM = + ConfigOptions.key("write-parallelism").intType().noDefaultValue(); + + public static final ConfigOption COMPACTION_ENABLE = + ConfigOptions.key("compaction-enabled").booleanType().defaultValue(false); + + @Experimental + public static final ConfigOption TABLE_REFRESH_INTERVAL = + ConfigOptions.key("table-refresh-interval").durationType().noDefaultValue(); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java new file mode 100644 index 000000000000..c8c11474177c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.util.List; +import java.util.Map; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.catalog.UniqueConstraint; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.sink.DataStreamSinkProvider; +import org.apache.flink.table.connector.sink.DynamicTableSink; +import org.apache.flink.table.connector.sink.abilities.SupportsOverwrite; +import org.apache.flink.table.connector.sink.abilities.SupportsPartitioning; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.flink.sink.FlinkSink; +import org.apache.iceberg.flink.sink.IcebergSink; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; + +public class IcebergTableSink implements DynamicTableSink, SupportsPartitioning, SupportsOverwrite { + private final TableLoader tableLoader; + @Deprecated private final TableSchema tableSchema; + private final ResolvedSchema resolvedSchema; + private final ReadableConfig readableConfig; + private final Map writeProps; + + private boolean overwrite = false; + + private IcebergTableSink(IcebergTableSink toCopy) { + this.tableLoader = toCopy.tableLoader; + this.tableSchema = toCopy.tableSchema; + this.resolvedSchema = toCopy.resolvedSchema; + this.overwrite = toCopy.overwrite; + this.readableConfig = toCopy.readableConfig; + this.writeProps = toCopy.writeProps; + } + + /** + * @deprecated since 1.10.0, will be removed in 2.0.0. 
Use {@link #IcebergTableSink(TableLoader, + * ResolvedSchema, ReadableConfig, Map)} instead + */ + @Deprecated + public IcebergTableSink( + TableLoader tableLoader, + TableSchema tableSchema, + ReadableConfig readableConfig, + Map writeProps) { + this.tableLoader = tableLoader; + this.tableSchema = tableSchema; + this.resolvedSchema = null; + this.readableConfig = readableConfig; + this.writeProps = writeProps; + } + + public IcebergTableSink( + TableLoader tableLoader, + ResolvedSchema resolvedSchema, + ReadableConfig readableConfig, + Map writeProps) { + this.tableLoader = tableLoader; + this.tableSchema = null; + this.resolvedSchema = resolvedSchema; + this.readableConfig = readableConfig; + this.writeProps = writeProps; + } + + @Override + public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { + Preconditions.checkState( + !overwrite || context.isBounded(), + "Unbounded data stream doesn't support overwrite operation."); + + if (resolvedSchema != null) { + List equalityColumns = + resolvedSchema + .getPrimaryKey() + .map(UniqueConstraint::getColumns) + .orElseGet(ImmutableList::of); + + return (DataStreamSinkProvider) + (providerContext, dataStream) -> { + if (Boolean.TRUE.equals( + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_V2_SINK))) { + return IcebergSink.forRowData(dataStream) + .tableLoader(tableLoader) + .resolvedSchema(resolvedSchema) + .equalityFieldColumns(equalityColumns) + .overwrite(overwrite) + .setAll(writeProps) + .flinkConf(readableConfig) + .append(); + } else { + return FlinkSink.forRowData(dataStream) + .tableLoader(tableLoader) + .resolvedSchema(resolvedSchema) + .equalityFieldColumns(equalityColumns) + .overwrite(overwrite) + .setAll(writeProps) + .flinkConf(readableConfig) + .append(); + } + }; + } else { + List equalityColumns = + tableSchema + .getPrimaryKey() + .map(org.apache.flink.table.legacy.api.constraints.UniqueConstraint::getColumns) + .orElseGet(ImmutableList::of); + + return (DataStreamSinkProvider) + (providerContext, dataStream) -> { + if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_V2_SINK)) { + return IcebergSink.forRowData(dataStream) + .tableLoader(tableLoader) + .tableSchema(tableSchema) + .equalityFieldColumns(equalityColumns) + .overwrite(overwrite) + .setAll(writeProps) + .flinkConf(readableConfig) + .append(); + } else { + return FlinkSink.forRowData(dataStream) + .tableLoader(tableLoader) + .tableSchema(tableSchema) + .equalityFieldColumns(equalityColumns) + .overwrite(overwrite) + .setAll(writeProps) + .flinkConf(readableConfig) + .append(); + } + }; + } + } + + @Override + public void applyStaticPartition(Map partition) { + // The flink's PartitionFanoutWriter will handle the static partition write policy + // automatically. 
+ } + + @Override + public ChangelogMode getChangelogMode(ChangelogMode requestedMode) { + ChangelogMode.Builder builder = ChangelogMode.newBuilder(); + for (RowKind kind : requestedMode.getContainedKinds()) { + builder.addContainedKind(kind); + } + return builder.build(); + } + + @Override + public DynamicTableSink copy() { + return new IcebergTableSink(this); + } + + @Override + public String asSummaryString() { + return "Iceberg table sink"; + } + + @Override + public void applyOverwrite(boolean newOverwrite) { + this.overwrite = newOverwrite; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java new file mode 100644 index 000000000000..3ef611f2ded5 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.lang.reflect.Array; +import java.nio.ByteBuffer; +import java.time.LocalDateTime; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.DateTimeUtil; +import org.apache.iceberg.util.UUIDUtil; + +public class RowDataWrapper implements StructLike { + + private final LogicalType[] types; + private final PositionalGetter[] getters; + private RowData rowData = null; + + public RowDataWrapper(RowType rowType, Types.StructType struct) { + int size = rowType.getFieldCount(); + + types = (LogicalType[]) Array.newInstance(LogicalType.class, size); + getters = (PositionalGetter[]) Array.newInstance(PositionalGetter.class, size); + + for (int i = 0; i < size; i++) { + types[i] = rowType.getTypeAt(i); + getters[i] = buildGetter(types[i], struct.fields().get(i).type()); + } + } + + public RowDataWrapper wrap(RowData data) { + this.rowData = data; + return this; + } + + @Override + public int size() { + return types.length; + } + + @Override + public T get(int pos, Class javaClass) { + if (rowData.isNullAt(pos)) { + return null; + } else if (getters[pos] != null) { + return javaClass.cast(getters[pos].get(rowData, pos)); + } + + Object value = FlinkRowData.createFieldGetter(types[pos], pos).getFieldOrNull(rowData); + return javaClass.cast(value); + } + + @Override + public void set(int pos, 
T value) { + throw new UnsupportedOperationException( + "Could not set a field in the RowDataWrapper because rowData is read-only"); + } + + private interface PositionalGetter { + T get(RowData data, int pos); + } + + private static PositionalGetter buildGetter(LogicalType logicalType, Type type) { + switch (logicalType.getTypeRoot()) { + case TINYINT: + return (row, pos) -> (int) row.getByte(pos); + case SMALLINT: + return (row, pos) -> (int) row.getShort(pos); + case CHAR: + case VARCHAR: + return (row, pos) -> row.getString(pos).toString(); + + case BINARY: + case VARBINARY: + if (Type.TypeID.UUID == type.typeId()) { + return (row, pos) -> UUIDUtil.convert(row.getBinary(pos)); + } else { + return (row, pos) -> ByteBuffer.wrap(row.getBinary(pos)); + } + + case DECIMAL: + DecimalType decimalType = (DecimalType) logicalType; + return (row, pos) -> + row.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); + + case TIME_WITHOUT_TIME_ZONE: + // Time in RowData is in milliseconds (Integer), while iceberg's time is microseconds + // (Long). + return (row, pos) -> ((long) row.getInt(pos)) * 1_000; + + case TIMESTAMP_WITHOUT_TIME_ZONE: + TimestampType timestampType = (TimestampType) logicalType; + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.microsFromTimestamp(localDateTime); + }; + + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1000 + + timestampData.getNanoOfMillisecond() / 1000; + }; + + case ROW: + RowType rowType = (RowType) logicalType; + Types.StructType structType = (Types.StructType) type; + + RowDataWrapper nestedWrapper = new RowDataWrapper(rowType, structType); + return (row, pos) -> nestedWrapper.wrap(row.getRow(pos, rowType.getFieldCount())); + + default: + return null; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java new file mode 100644 index 000000000000..da509451fee7 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.io.Closeable; +import java.io.IOException; +import java.io.Serializable; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.hadoop.SerializableConfiguration; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +/** + * Serializable loader to load an Iceberg {@link Table}. Flink needs to get {@link Table} objects in + * the cluster (for example, to get splits), not just on the client side. So we need an Iceberg + * table loader to get the {@link Table} object. + */ +public interface TableLoader extends Closeable, Serializable, Cloneable { + + void open(); + + boolean isOpen(); + + Table loadTable(); + + /** Clone a TableLoader */ + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + TableLoader clone(); + + static TableLoader fromCatalog(CatalogLoader catalogLoader, TableIdentifier identifier) { + return new CatalogTableLoader(catalogLoader, identifier); + } + + static TableLoader fromHadoopTable(String location) { + return fromHadoopTable(location, FlinkCatalogFactory.clusterHadoopConf()); + } + + static TableLoader fromHadoopTable(String location, Configuration hadoopConf) { + return new HadoopTableLoader(location, hadoopConf); + } + + class HadoopTableLoader implements TableLoader { + + private static final long serialVersionUID = 1L; + + private final String location; + private final SerializableConfiguration hadoopConf; + + private transient HadoopTables tables; + + private HadoopTableLoader(String location, Configuration conf) { + this.location = location; + this.hadoopConf = new SerializableConfiguration(conf); + } + + @Override + public void open() { + tables = new HadoopTables(hadoopConf.get()); + } + + @Override + public boolean isOpen() { + return tables != null; + } + + @Override + public Table loadTable() { + FlinkEnvironmentContext.init(); + return tables.load(location); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public TableLoader clone() { + return new HadoopTableLoader(location, new Configuration(hadoopConf.get())); + } + + @Override + public void close() {} + + @Override + public String toString() { + return MoreObjects.toStringHelper(this).add("location", location).toString(); + } + } + + class CatalogTableLoader implements TableLoader { + + private static final long serialVersionUID = 1L; + + private final CatalogLoader catalogLoader; + private final String identifier; + + private transient Catalog catalog; + + private CatalogTableLoader(CatalogLoader catalogLoader, TableIdentifier tableIdentifier) { + this.catalogLoader = catalogLoader; + this.identifier = tableIdentifier.toString(); + } + + @Override + public void open() { + catalog = catalogLoader.loadCatalog(); + } + + @Override + public boolean isOpen() { + return catalog != null; + } + + @Override + public Table loadTable() { + FlinkEnvironmentContext.init(); + return catalog.loadTable(TableIdentifier.parse(identifier)); + } + + @Override + public void close() throws IOException { + if (catalog instanceof Closeable) { + ((Closeable) catalog).close(); + } + + catalog = null; + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public TableLoader clone() { + return new CatalogTableLoader(catalogLoader.clone(), TableIdentifier.parse(identifier)); + 
} + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("tableIdentifier", identifier) + .add("catalogLoader", catalogLoader) + .toString(); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java new file mode 100644 index 000000000000..72a646991456 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.util.List; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.BigIntType; +import org.apache.flink.table.types.logical.BinaryType; +import org.apache.flink.table.types.logical.BooleanType; +import org.apache.flink.table.types.logical.DateType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.DoubleType; +import org.apache.flink.table.types.logical.FloatType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.NullType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.VarBinaryType; +import org.apache.flink.table.types.logical.VarCharType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; + +class TypeToFlinkType extends TypeUtil.SchemaVisitor { + TypeToFlinkType() {} + + @Override + public LogicalType schema(Schema schema, LogicalType structType) { + return structType; + } + + @Override + public LogicalType struct(Types.StructType struct, List fieldResults) { + List fields = struct.fields(); + + List flinkFields = Lists.newArrayListWithExpectedSize(fieldResults.size()); + for (int i = 0; i < fields.size(); i += 1) { + Types.NestedField field = fields.get(i); + LogicalType type = fieldResults.get(i); + RowType.RowField flinkField = + new RowType.RowField(field.name(), type.copy(field.isOptional()), field.doc()); + flinkFields.add(flinkField); + } + + return new RowType(flinkFields); + } + + @Override + public LogicalType field(Types.NestedField field, LogicalType fieldResult) { + return fieldResult; + } + + @Override + public LogicalType list(Types.ListType 
list, LogicalType elementResult) { + return new ArrayType(elementResult.copy(list.isElementOptional())); + } + + @Override + public LogicalType map(Types.MapType map, LogicalType keyResult, LogicalType valueResult) { + // Keys in a map are not allowed to be null. + return new MapType(keyResult.copy(false), valueResult.copy(map.isValueOptional())); + } + + @Override + public LogicalType primitive(Type.PrimitiveType primitive) { + switch (primitive.typeId()) { + case UNKNOWN: + return new NullType(); + case BOOLEAN: + return new BooleanType(); + case INTEGER: + return new IntType(); + case LONG: + return new BigIntType(); + case FLOAT: + return new FloatType(); + case DOUBLE: + return new DoubleType(); + case DATE: + return new DateType(); + case TIME: + // For the type: Flink currently only supports TimeType with the default precision (seconds); + // time precision is not supported in Flink, so we treat it as a simple time type directly. + // For the data: Flink uses an int holding milliseconds to represent time data, so it + // supports millisecond precision. + return new TimeType(); + case TIMESTAMP: + Types.TimestampType timestamp = (Types.TimestampType) primitive; + if (timestamp.shouldAdjustToUTC()) { + // MICROS + return new LocalZonedTimestampType(6); + } else { + // MICROS + return new TimestampType(6); + } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestamp9 = (Types.TimestampNanoType) primitive; + if (timestamp9.shouldAdjustToUTC()) { + // NANOS + return new LocalZonedTimestampType(9); + } else { + // NANOS + return new TimestampType(9); + } + case STRING: + return new VarCharType(VarCharType.MAX_LENGTH); + case UUID: + // UUID length is 16 + return new BinaryType(16); + case FIXED: + Types.FixedType fixedType = (Types.FixedType) primitive; + return new BinaryType(fixedType.length()); + case BINARY: + return new VarBinaryType(VarBinaryType.MAX_LENGTH); + case DECIMAL: + Types.DecimalType decimal = (Types.DecimalType) primitive; + return new DecimalType(decimal.precision(), decimal.scale()); + default: + throw new UnsupportedOperationException( + "Cannot convert unknown type to Flink: " + primitive); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java new file mode 100644 index 000000000000..b96b47c5a785 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ +package org.apache.iceberg.flink.actions; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.Table; + +public class Actions { + + public static final Configuration CONFIG = + new Configuration() + // disable classloader check as Avro may cache class/object in the serializers. + .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + + private final StreamExecutionEnvironment env; + private final Table table; + + private Actions(StreamExecutionEnvironment env, Table table) { + this.env = env; + this.table = table; + } + + public static Actions forTable(StreamExecutionEnvironment env, Table table) { + return new Actions(env, table); + } + + public static Actions forTable(Table table) { + return new Actions(StreamExecutionEnvironment.getExecutionEnvironment(CONFIG), table); + } + + public RewriteDataFilesAction rewriteDataFiles() { + return new RewriteDataFilesAction(env, table); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java new file mode 100644 index 000000000000..4cf30ed90418 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.actions; + +import java.util.List; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableUtil; +import org.apache.iceberg.actions.BaseRewriteDataFilesAction; +import org.apache.iceberg.flink.source.RowDataRewriter; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +public class RewriteDataFilesAction extends BaseRewriteDataFilesAction { + + private final StreamExecutionEnvironment env; + private int maxParallelism; + + public RewriteDataFilesAction(StreamExecutionEnvironment env, Table table) { + super(table); + this.env = env; + this.maxParallelism = env.getParallelism(); + Preconditions.checkArgument( + !TableUtil.supportsRowLineage(table), + "Flink does not support compaction on row lineage enabled tables (V3+)"); + } + + @Override + protected FileIO fileIO() { + return table().io(); + } + + @Override + protected List rewriteDataForTasks(List combinedScanTasks) { + int size = combinedScanTasks.size(); + int parallelism = Math.min(size, maxParallelism); + DataStream dataStream = env.fromCollection(combinedScanTasks); + RowDataRewriter rowDataRewriter = + new RowDataRewriter(table(), caseSensitive(), fileIO(), encryptionManager()); + try { + return rowDataRewriter.rewriteDataForTasks(dataStream, parallelism); + } catch (Exception e) { + throw new RuntimeException("Rewrite data file error.", e); + } + } + + @Override + protected RewriteDataFilesAction self() { + return this; + } + + public RewriteDataFilesAction maxParallelism(int parallelism) { + Preconditions.checkArgument(parallelism > 0, "Invalid max parallelism %s", parallelism); + this.maxParallelism = parallelism; + return this; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java new file mode 100644 index 000000000000..8103224a0b6c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeFamily; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.NullType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.avro.AvroWithPartnerByStructureVisitor; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.util.Pair; + +public abstract class AvroWithFlinkSchemaVisitor + extends AvroWithPartnerByStructureVisitor { + + @Override + protected boolean isStringType(LogicalType logicalType) { + return logicalType.getTypeRoot().getFamilies().contains(LogicalTypeFamily.CHARACTER_STRING); + } + + @Override + protected boolean isMapType(LogicalType logicalType) { + return logicalType instanceof MapType; + } + + @Override + protected LogicalType arrayElementType(LogicalType arrayType) { + Preconditions.checkArgument( + arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); + return ((ArrayType) arrayType).getElementType(); + } + + @Override + protected LogicalType mapKeyType(LogicalType mapType) { + Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); + return ((MapType) mapType).getKeyType(); + } + + @Override + protected LogicalType mapValueType(LogicalType mapType) { + Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); + return ((MapType) mapType).getValueType(); + } + + @Override + protected Pair fieldNameAndType(LogicalType structType, int pos) { + Preconditions.checkArgument( + structType instanceof RowType, "Invalid struct: %s is not a struct", structType); + RowType.RowField field = ((RowType) structType).getFields().get(pos); + return Pair.of(field.getName(), field.getType()); + } + + @Override + protected LogicalType nullType() { + return new NullType(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java new file mode 100644 index 000000000000..66ed95792e62 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.io.Encoder; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FieldMetrics; +import org.apache.iceberg.avro.MetricsAwareDatumWriter; +import org.apache.iceberg.avro.ValueWriter; +import org.apache.iceberg.avro.ValueWriters; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +public class FlinkAvroWriter implements MetricsAwareDatumWriter { + private final RowType rowType; + private ValueWriter writer = null; + + public FlinkAvroWriter(RowType rowType) { + this.rowType = rowType; + } + + @Override + @SuppressWarnings("unchecked") + public void setSchema(Schema schema) { + this.writer = + (ValueWriter) + AvroWithFlinkSchemaVisitor.visit(rowType, schema, new WriteBuilder()); + } + + @Override + public void write(RowData datum, Encoder out) throws IOException { + writer.write(datum, out); + } + + @Override + public Stream metrics() { + return writer.metrics(); + } + + private static class WriteBuilder extends AvroWithFlinkSchemaVisitor> { + @Override + public ValueWriter record( + LogicalType struct, Schema record, List names, List> fields) { + return FlinkValueWriters.row( + fields, + IntStream.range(0, names.size()) + .mapToObj(i -> fieldNameAndType(struct, i).second()) + .collect(Collectors.toList())); + } + + @Override + public ValueWriter union(LogicalType type, Schema union, List> options) { + Preconditions.checkArgument( + options.contains(ValueWriters.nulls()), + "Cannot create writer for non-option union: %s", + union); + Preconditions.checkArgument( + options.size() == 2, "Cannot create writer for non-option union: %s", union); + if (union.getTypes().get(0).getType() == Schema.Type.NULL) { + return ValueWriters.option(0, options.get(1)); + } else { + return ValueWriters.option(1, options.get(0)); + } + } + + @Override + public ValueWriter array(LogicalType sArray, Schema array, ValueWriter elementWriter) { + return FlinkValueWriters.array(elementWriter, arrayElementType(sArray)); + } + + @Override + public ValueWriter map(LogicalType sMap, Schema map, ValueWriter valueReader) { + return FlinkValueWriters.map( + FlinkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); + } + + @Override + public ValueWriter map( + LogicalType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { + return FlinkValueWriters.arrayMap( + keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); + } + + @Override + public ValueWriter primitive(LogicalType type, Schema primitive) { + org.apache.avro.LogicalType logicalType = primitive.getLogicalType(); + if (logicalType != null) { + switch (logicalType.getName()) { + case "date": + return ValueWriters.ints(); + + case "time-micros": + return FlinkValueWriters.timeMicros(); + + case "timestamp-micros": + return FlinkValueWriters.timestampMicros(); + + case "timestamp-nanos": + return FlinkValueWriters.timestampNanos(); + + case "decimal": + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; + return FlinkValueWriters.decimal(decimal.getPrecision(), decimal.getScale()); + + case "uuid": + return ValueWriters.uuids(); + + default: + throw new 
IllegalArgumentException("Unsupported logical type: " + logicalType); + } + } + + switch (primitive.getType()) { + case NULL: + return ValueWriters.nulls(); + case BOOLEAN: + return ValueWriters.booleans(); + case INT: + switch (type.getTypeRoot()) { + case TINYINT: + return ValueWriters.tinyints(); + case SMALLINT: + return ValueWriters.shorts(); + default: + return ValueWriters.ints(); + } + case LONG: + return ValueWriters.longs(); + case FLOAT: + return ValueWriters.floats(); + case DOUBLE: + return ValueWriters.doubles(); + case STRING: + return FlinkValueWriters.strings(); + case FIXED: + return ValueWriters.fixed(primitive.getFixedSize()); + case BYTES: + return ValueWriters.bytes(); + default: + throw new IllegalArgumentException("Unsupported type: " + primitive); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java new file mode 100644 index 000000000000..65b9d44ad4b8 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import java.util.List; +import java.util.Map; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.orc.OrcRowReader; +import org.apache.iceberg.orc.OrcSchemaWithTypeVisitor; +import org.apache.iceberg.orc.OrcValueReader; +import org.apache.iceberg.orc.OrcValueReaders; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.orc.TypeDescription; +import org.apache.orc.storage.ql.exec.vector.StructColumnVector; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; + +public class FlinkOrcReader implements OrcRowReader { + private final OrcValueReader reader; + + public FlinkOrcReader(Schema iSchema, TypeDescription readSchema) { + this(iSchema, readSchema, ImmutableMap.of()); + } + + public FlinkOrcReader(Schema iSchema, TypeDescription readSchema, Map idToConstant) { + this.reader = + OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant)); + } + + @Override + public RowData read(VectorizedRowBatch batch, int row) { + return (RowData) reader.read(new StructColumnVector(batch.size, batch.cols), row); + } + + @Override + public void setBatchContext(long batchOffsetInFile) { + reader.setBatchContext(batchOffsetInFile); + } + + private static class ReadBuilder extends OrcSchemaWithTypeVisitor> { + private final Map idToConstant; + + private ReadBuilder(Map idToConstant) { + this.idToConstant = idToConstant; + } + + @Override + public OrcValueReader record( + Types.StructType iStruct, + TypeDescription record, + List names, + List> fields) { + return FlinkOrcReaders.struct(fields, iStruct, idToConstant); + } + + @Override + public OrcValueReader list( + Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { + return FlinkOrcReaders.array(elementReader); + } + + @Override + public OrcValueReader map( + Types.MapType iMap, + TypeDescription map, + OrcValueReader keyReader, + OrcValueReader valueReader) { + return FlinkOrcReaders.map(keyReader, valueReader); + } + + @Override + public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescription primitive) { + switch (iPrimitive.typeId()) { + case BOOLEAN: + return OrcValueReaders.booleans(); + case INTEGER: + return OrcValueReaders.ints(); + case LONG: + return OrcValueReaders.longs(); + case FLOAT: + return OrcValueReaders.floats(); + case DOUBLE: + return OrcValueReaders.doubles(); + case DATE: + return FlinkOrcReaders.dates(); + case TIME: + return FlinkOrcReaders.times(); + case TIMESTAMP: + Types.TimestampType timestampType = (Types.TimestampType) iPrimitive; + if (timestampType.shouldAdjustToUTC()) { + return FlinkOrcReaders.timestampTzs(); + } else { + return FlinkOrcReaders.timestamps(); + } + case STRING: + return FlinkOrcReaders.strings(); + case UUID: + case FIXED: + case BINARY: + return OrcValueReaders.bytes(); + case DECIMAL: + Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; + return FlinkOrcReaders.decimals(decimalType.precision(), decimalType.scale()); + default: + throw new IllegalArgumentException( + String.format( + "Invalid iceberg type %s corresponding to ORC type %s", iPrimitive, primitive)); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java new file mode 100644 index 000000000000..7a4a15c7e600 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.util.List; +import java.util.Map; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.orc.OrcValueReader; +import org.apache.iceberg.orc.OrcValueReaders; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; +import org.apache.orc.storage.ql.exec.vector.ColumnVector; +import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; +import org.apache.orc.storage.ql.exec.vector.ListColumnVector; +import org.apache.orc.storage.ql.exec.vector.LongColumnVector; +import org.apache.orc.storage.ql.exec.vector.MapColumnVector; +import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; +import org.apache.orc.storage.serde2.io.HiveDecimalWritable; + +class FlinkOrcReaders { + private FlinkOrcReaders() {} + + static OrcValueReader strings() { + return StringReader.INSTANCE; + } + + static OrcValueReader dates() { + return DateReader.INSTANCE; + } + + static OrcValueReader decimals(int precision, int scale) { + if (precision <= 18) { + return new Decimal18Reader(precision, scale); + } else if (precision <= 38) { + return new Decimal38Reader(precision, scale); + } else { + throw new IllegalArgumentException("Invalid precision: " + precision); + } + } + + static OrcValueReader times() { + return TimeReader.INSTANCE; + } + + static OrcValueReader timestamps() { + return TimestampReader.INSTANCE; + } + + static OrcValueReader timestampTzs() { + return TimestampTzReader.INSTANCE; + } + + static OrcValueReader array(OrcValueReader elementReader) { + return new ArrayReader<>(elementReader); + } + + public static OrcValueReader map( + OrcValueReader keyReader, OrcValueReader valueReader) { + return 
new MapReader<>(keyReader, valueReader); + } + + public static OrcValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { + return new StructReader(readers, struct, idToConstant); + } + + private static class StringReader implements OrcValueReader { + private static final StringReader INSTANCE = new StringReader(); + + @Override + public StringData nonNullRead(ColumnVector vector, int row) { + BytesColumnVector bytesVector = (BytesColumnVector) vector; + return StringData.fromBytes( + bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + } + } + + private static class DateReader implements OrcValueReader { + private static final DateReader INSTANCE = new DateReader(); + + @Override + public Integer nonNullRead(ColumnVector vector, int row) { + return (int) ((LongColumnVector) vector).vector[row]; + } + } + + private static class Decimal18Reader implements OrcValueReader { + private final int precision; + private final int scale; + + Decimal18Reader(int precision, int scale) { + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData nonNullRead(ColumnVector vector, int row) { + HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; + + // The Hive ORC writer may adjust the scale of decimal data. + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); + + return DecimalData.fromUnscaledLong(value.serialize64(scale), precision, scale); + } + } + + private static class Decimal38Reader implements OrcValueReader { + private final int precision; + private final int scale; + + Decimal38Reader(int precision, int scale) { + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData nonNullRead(ColumnVector vector, int row) { + BigDecimal value = + ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); + + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); + + return DecimalData.fromBigDecimal(value, precision, scale); + } + } + + private static class TimeReader implements OrcValueReader { + private static final TimeReader INSTANCE = new TimeReader(); + + @Override + public Integer nonNullRead(ColumnVector vector, int row) { + long micros = ((LongColumnVector) vector).vector[row]; + // Flink only supports time in milliseconds, so truncate the microseconds.
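+ // For example, 36_000_000_500 micros (10:00:00.000500) becomes 36_000_000 millis (10:00:00.000).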
+ return (int) (micros / 1000); + } + } + + private static class TimestampReader implements OrcValueReader { + private static final TimestampReader INSTANCE = new TimestampReader(); + + @Override + public TimestampData nonNullRead(ColumnVector vector, int row) { + TimestampColumnVector tcv = (TimestampColumnVector) vector; + LocalDateTime localDate = + Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC) + .toLocalDateTime(); + return TimestampData.fromLocalDateTime(localDate); + } + } + + private static class TimestampTzReader implements OrcValueReader { + private static final TimestampTzReader INSTANCE = new TimestampTzReader(); + + @Override + public TimestampData nonNullRead(ColumnVector vector, int row) { + TimestampColumnVector tcv = (TimestampColumnVector) vector; + Instant instant = + Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC) + .toInstant(); + return TimestampData.fromInstant(instant); + } + } + + private static class ArrayReader implements OrcValueReader { + private final OrcValueReader elementReader; + + private ArrayReader(OrcValueReader elementReader) { + this.elementReader = elementReader; + } + + @Override + public ArrayData nonNullRead(ColumnVector vector, int row) { + ListColumnVector listVector = (ListColumnVector) vector; + int offset = (int) listVector.offsets[row]; + int length = (int) listVector.lengths[row]; + List elements = Lists.newArrayListWithExpectedSize(length); + for (int c = 0; c < length; ++c) { + elements.add(elementReader.read(listVector.child, offset + c)); + } + return new GenericArrayData(elements.toArray()); + } + + @Override + public void setBatchContext(long batchOffsetInFile) { + elementReader.setBatchContext(batchOffsetInFile); + } + } + + private static class MapReader implements OrcValueReader { + private final OrcValueReader keyReader; + private final OrcValueReader valueReader; + + private MapReader(OrcValueReader keyReader, OrcValueReader valueReader) { + this.keyReader = keyReader; + this.valueReader = valueReader; + } + + @Override + public MapData nonNullRead(ColumnVector vector, int row) { + MapColumnVector mapVector = (MapColumnVector) vector; + int offset = (int) mapVector.offsets[row]; + long length = mapVector.lengths[row]; + + Map map = Maps.newHashMap(); + for (int c = 0; c < length; c++) { + K key = keyReader.read(mapVector.keys, offset + c); + V value = valueReader.read(mapVector.values, offset + c); + map.put(key, value); + } + + return new GenericMapData(map); + } + + @Override + public void setBatchContext(long batchOffsetInFile) { + keyReader.setBatchContext(batchOffsetInFile); + valueReader.setBatchContext(batchOffsetInFile); + } + } + + private static class StructReader extends OrcValueReaders.StructReader { + private final int numFields; + + StructReader( + List> readers, Types.StructType struct, Map idToConstant) { + super(readers, struct, idToConstant); + this.numFields = struct.fields().size(); + } + + @Override + protected RowData create() { + return new GenericRowData(numFields); + } + + @Override + protected void set(RowData struct, int pos, Object value) { + ((GenericRowData) struct).setField(pos, value); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java new file mode 100644 index 000000000000..6a31accffd22 --- /dev/null +++ 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.util.Deque; +import java.util.List; +import java.util.stream.Stream; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FieldMetrics; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.orc.GenericOrcWriters; +import org.apache.iceberg.orc.OrcRowWriter; +import org.apache.iceberg.orc.OrcValueWriter; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; + +public class FlinkOrcWriter implements OrcRowWriter { + private final FlinkOrcWriters.RowDataWriter writer; + + private FlinkOrcWriter(RowType rowType, Schema iSchema) { + this.writer = + (FlinkOrcWriters.RowDataWriter) + FlinkSchemaVisitor.visit(rowType, iSchema, new WriteBuilder()); + } + + public static OrcRowWriter buildWriter(RowType rowType, Schema iSchema) { + return new FlinkOrcWriter(rowType, iSchema); + } + + @Override + public void write(RowData row, VectorizedRowBatch output) { + Preconditions.checkArgument(row != null, "value must not be null"); + writer.writeRow(row, output); + } + + @Override + public List> writers() { + return writer.writers(); + } + + @Override + public Stream> metrics() { + return writer.metrics(); + } + + private static class WriteBuilder extends FlinkSchemaVisitor> { + private final Deque fieldIds = Lists.newLinkedList(); + + private WriteBuilder() {} + + @Override + public void beforeField(Types.NestedField field) { + fieldIds.push(field.fieldId()); + } + + @Override + public void afterField(Types.NestedField field) { + fieldIds.pop(); + } + + @Override + public OrcValueWriter record( + Types.StructType iStruct, List> results, List fieldType) { + return FlinkOrcWriters.struct(results, fieldType); + } + + @Override + public OrcValueWriter map( + Types.MapType iMap, + OrcValueWriter key, + OrcValueWriter value, + LogicalType keyType, + LogicalType valueType) { + return FlinkOrcWriters.map(key, value, keyType, valueType); + } + + @Override + public OrcValueWriter list( + Types.ListType iList, OrcValueWriter element, LogicalType elementType) { + return FlinkOrcWriters.list(element, elementType); + } + + @Override + public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { + switch (iPrimitive.typeId()) { + case BOOLEAN: + return 
GenericOrcWriters.booleans(); + case INTEGER: + switch (flinkPrimitive.getTypeRoot()) { + case TINYINT: + return GenericOrcWriters.bytes(); + case SMALLINT: + return GenericOrcWriters.shorts(); + } + return GenericOrcWriters.ints(); + case LONG: + return GenericOrcWriters.longs(); + case FLOAT: + Preconditions.checkArgument( + fieldIds.peek() != null, + String.format( + "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + + "information is not properly pushed during schema visiting.", + iPrimitive)); + return GenericOrcWriters.floats(fieldIds.peek()); + case DOUBLE: + Preconditions.checkArgument( + fieldIds.peek() != null, + String.format( + "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + + "information is not properly pushed during schema visiting.", + iPrimitive)); + return GenericOrcWriters.doubles(fieldIds.peek()); + case DATE: + return FlinkOrcWriters.dates(); + case TIME: + return FlinkOrcWriters.times(); + case TIMESTAMP: + Types.TimestampType timestampType = (Types.TimestampType) iPrimitive; + if (timestampType.shouldAdjustToUTC()) { + return FlinkOrcWriters.timestampTzs(); + } else { + return FlinkOrcWriters.timestamps(); + } + case STRING: + return FlinkOrcWriters.strings(); + case UUID: + case FIXED: + case BINARY: + return GenericOrcWriters.byteArrays(); + case DECIMAL: + Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; + return FlinkOrcWriters.decimals(decimalType.precision(), decimalType.scale()); + default: + throw new IllegalArgumentException( + String.format( + "Invalid iceberg type %s corresponding to Flink logical type %s", + iPrimitive, flinkPrimitive)); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java new file mode 100644 index 000000000000..afce2cda1db1 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import java.time.Instant; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.util.List; +import java.util.stream.Stream; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.iceberg.FieldMetrics; +import org.apache.iceberg.data.orc.GenericOrcWriters; +import org.apache.iceberg.flink.FlinkRowData; +import org.apache.iceberg.orc.OrcValueWriter; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.orc.storage.common.type.HiveDecimal; +import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; +import org.apache.orc.storage.ql.exec.vector.ColumnVector; +import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; +import org.apache.orc.storage.ql.exec.vector.ListColumnVector; +import org.apache.orc.storage.ql.exec.vector.LongColumnVector; +import org.apache.orc.storage.ql.exec.vector.MapColumnVector; +import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; + +class FlinkOrcWriters { + + private FlinkOrcWriters() {} + + static OrcValueWriter strings() { + return StringWriter.INSTANCE; + } + + static OrcValueWriter dates() { + return DateWriter.INSTANCE; + } + + static OrcValueWriter times() { + return TimeWriter.INSTANCE; + } + + static OrcValueWriter timestamps() { + return TimestampWriter.INSTANCE; + } + + static OrcValueWriter timestampTzs() { + return TimestampTzWriter.INSTANCE; + } + + static OrcValueWriter decimals(int precision, int scale) { + if (precision <= 18) { + return new Decimal18Writer(precision, scale); + } else if (precision <= 38) { + return new Decimal38Writer(precision, scale); + } else { + throw new IllegalArgumentException("Invalid precision: " + precision); + } + } + + static OrcValueWriter list( + OrcValueWriter elementWriter, LogicalType elementType) { + return new ListWriter<>(elementWriter, elementType); + } + + static OrcValueWriter map( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { + return new MapWriter<>(keyWriter, valueWriter, keyType, valueType); + } + + static OrcValueWriter struct(List> writers, List types) { + return new RowDataWriter(writers, types); + } + + private static class StringWriter implements OrcValueWriter { + private static final StringWriter INSTANCE = new StringWriter(); + + @Override + public void nonNullWrite(int rowId, StringData data, ColumnVector output) { + byte[] value = data.toBytes(); + ((BytesColumnVector) output).setRef(rowId, value, 0, value.length); + } + } + + private static class DateWriter implements OrcValueWriter { + private static final DateWriter INSTANCE = new DateWriter(); + + @Override + public void nonNullWrite(int rowId, Integer data, ColumnVector output) { + ((LongColumnVector) output).vector[rowId] = data; + } + } + + private static class TimeWriter implements OrcValueWriter { + private static final TimeWriter INSTANCE = new TimeWriter(); + + @Override + public void nonNullWrite(int rowId, Integer millis, ColumnVector output) { + // The time in flink is in millisecond, while the standard time in iceberg is microsecond. 
+ // So we need to transform it to microsecond. + ((LongColumnVector) output).vector[rowId] = millis * 1000L; + } + } + + private static class TimestampWriter implements OrcValueWriter { + private static final TimestampWriter INSTANCE = new TimestampWriter(); + + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + cv.setIsUTC(true); + // millis + OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); + cv.time[rowId] = + offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; + // truncate nanos to only keep microsecond precision. + cv.nanos[rowId] = (offsetDateTime.getNano() / 1_000) * 1_000; + } + } + + private static class TimestampTzWriter implements OrcValueWriter { + private static final TimestampTzWriter INSTANCE = new TimestampTzWriter(); + + @SuppressWarnings("JavaInstantGetSecondsGetNano") + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + // millis + Instant instant = data.toInstant(); + cv.time[rowId] = instant.toEpochMilli(); + // truncate nanos to only keep microsecond precision. + cv.nanos[rowId] = (instant.getNano() / 1_000) * 1_000; + } + } + + private static class Decimal18Writer implements OrcValueWriter { + private final int precision; + private final int scale; + + Decimal18Writer(int precision, int scale) { + this.precision = precision; + this.scale = scale; + } + + @Override + public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { + Preconditions.checkArgument( + scale == data.scale(), + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + data); + + ((DecimalColumnVector) output) + .vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale()); + } + } + + private static class Decimal38Writer implements OrcValueWriter { + private final int precision; + private final int scale; + + Decimal38Writer(int precision, int scale) { + this.precision = precision; + this.scale = scale; + } + + @Override + public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { + Preconditions.checkArgument( + scale == data.scale(), + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + data); + + ((DecimalColumnVector) output) + .vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false)); + } + } + + static class ListWriter implements OrcValueWriter { + private final OrcValueWriter elementWriter; + private final ArrayData.ElementGetter elementGetter; + + ListWriter(OrcValueWriter elementWriter, LogicalType elementType) { + this.elementWriter = elementWriter; + this.elementGetter = ArrayData.createElementGetter(elementType); + } + + @Override + @SuppressWarnings("unchecked") + public void nonNullWrite(int rowId, ArrayData data, ColumnVector output) { + ListColumnVector cv = (ListColumnVector) output; + cv.lengths[rowId] = data.size(); + cv.offsets[rowId] = cv.childCount; + cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); + // make sure the child is big enough. 
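+ // List elements are stored flattened in cv.child starting at cv.offsets[rowId], so cv.child must hold at least cv.childCount entries before the elements are written.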
+ growColumnVector(cv.child, cv.childCount); + + for (int e = 0; e < cv.lengths[rowId]; ++e) { + Object value = elementGetter.getElementOrNull(data, e); + elementWriter.write((int) (e + cv.offsets[rowId]), (T) value, cv.child); + } + } + + @Override + public Stream> metrics() { + return elementWriter.metrics(); + } + } + + static class MapWriter implements OrcValueWriter { + private final OrcValueWriter keyWriter; + private final OrcValueWriter valueWriter; + private final ArrayData.ElementGetter keyGetter; + private final ArrayData.ElementGetter valueGetter; + + MapWriter( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { + this.keyWriter = keyWriter; + this.valueWriter = valueWriter; + this.keyGetter = ArrayData.createElementGetter(keyType); + this.valueGetter = ArrayData.createElementGetter(valueType); + } + + @Override + @SuppressWarnings("unchecked") + public void nonNullWrite(int rowId, MapData data, ColumnVector output) { + MapColumnVector cv = (MapColumnVector) output; + ArrayData keyArray = data.keyArray(); + ArrayData valArray = data.valueArray(); + + // record the length and start of the list elements + cv.lengths[rowId] = data.size(); + cv.offsets[rowId] = cv.childCount; + cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); + // make sure the child is big enough + growColumnVector(cv.keys, cv.childCount); + growColumnVector(cv.values, cv.childCount); + // Add each element + for (int e = 0; e < cv.lengths[rowId]; ++e) { + int pos = (int) (e + cv.offsets[rowId]); + keyWriter.write(pos, (K) keyGetter.getElementOrNull(keyArray, e), cv.keys); + valueWriter.write(pos, (V) valueGetter.getElementOrNull(valArray, e), cv.values); + } + } + + @Override + public Stream> metrics() { + return Stream.concat(keyWriter.metrics(), valueWriter.metrics()); + } + } + + static class RowDataWriter extends GenericOrcWriters.StructWriter { + private final List fieldGetters; + + RowDataWriter(List> writers, List types) { + super(writers); + + this.fieldGetters = Lists.newArrayListWithExpectedSize(types.size()); + for (int i = 0; i < types.size(); i++) { + fieldGetters.add(FlinkRowData.createFieldGetter(types.get(i), i)); + } + } + + @Override + protected Object get(RowData struct, int index) { + return fieldGetters.get(index).getFieldOrNull(struct); + } + } + + private static void growColumnVector(ColumnVector cv, int requestedSize) { + if (cv.isNull.length < requestedSize) { + // Use growth factor of 3 to avoid frequent array allocations + cv.ensureSize(requestedSize * 3, true); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java new file mode 100644 index 000000000000..5c3581aef3ec --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java @@ -0,0 +1,860 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.Schema; +import org.apache.iceberg.parquet.ParquetSchemaUtil; +import org.apache.iceberg.parquet.ParquetValueReader; +import org.apache.iceberg.parquet.ParquetValueReaders; +import org.apache.iceberg.parquet.TypeWithSchemaVisitor; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +public class FlinkParquetReaders { + private FlinkParquetReaders() {} + + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { + return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); + } + + @SuppressWarnings("unchecked") + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { + return (ParquetValueReader) + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); + } + + private static class ReadBuilder extends TypeWithSchemaVisitor> { + private final MessageType type; + private final Map idToConstant; + + ReadBuilder(MessageType type, Map idToConstant) { + this.type = type; + this.idToConstant = idToConstant; + } + + @Override + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { + return struct(expected, message.asGroupType(), fieldReaders); + } + + @Override + @SuppressWarnings("checkstyle:CyclomaticComplexity") + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { + // match the expected struct's order + Map> readersById = Maps.newHashMap(); + Map typesById = Maps.newHashMap(); + Map maxDefinitionLevelsById = Maps.newHashMap(); + List fields = 
struct.getFields(); + for (int i = 0; i < fields.size(); i += 1) { + Type fieldType = fields.get(i); + if (fieldReaders.get(i) != null) { + int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1; + if (fieldType.getId() != null) { + int id = fieldType.getId().intValue(); + readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); + typesById.put(id, fieldType); + if (idToConstant.containsKey(id)) { + maxDefinitionLevelsById.put(id, fieldD); + } + } + } + } + + List expectedFields = + expected != null ? expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); + // Defaulting to parent max definition level + int defaultMaxDefinitionLevel = type.getMaxDefinitionLevel(currentPath()); + for (Types.NestedField field : expectedFields) { + int id = field.fieldId(); + ParquetValueReader reader = readersById.get(id); + if (idToConstant.containsKey(id)) { + // containsKey is used because the constant may be null + int fieldMaxDefinitionLevel = + maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel); + reorderedFields.add( + ParquetValueReaders.constant(idToConstant.get(id), fieldMaxDefinitionLevel)); + } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { + reorderedFields.add(ParquetValueReaders.position()); + } else if (id == MetadataColumns.IS_DELETED.fieldId()) { + reorderedFields.add(ParquetValueReaders.constant(false)); + } else if (reader != null) { + reorderedFields.add(reader); + } else if (field.initialDefault() != null) { + reorderedFields.add( + ParquetValueReaders.constant( + RowDataUtil.convertConstant(field.type(), field.initialDefault()), + maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel))); + } else if (field.isOptional()) { + reorderedFields.add(ParquetValueReaders.nulls()); + } else { + throw new IllegalArgumentException( + String.format("Missing required field: %s", field.name())); + } + } + + return new RowDataReader(reorderedFields); + } + + @Override + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { + if (expectedList == null) { + return null; + } + + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; + int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; + + Type elementType = ParquetSchemaUtil.determineListElementType(array); + int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; + + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + } + + @Override + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { + if (expectedMap == null) { + return null; + } + + GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; + int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; + + Type keyType = repeatedKeyValue.getType(0); + int keyD = type.getMaxDefinitionLevel(path(keyType.getName())) - 1; + Type valueType = repeatedKeyValue.getType(1); + int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; + + return new MapReader<>( + repeatedD, + repeatedR, + ParquetValueReaders.option(keyType, keyD, keyReader), + ParquetValueReaders.option(valueType, valueD, valueReader)); + } + + private static 
class LogicalTypeAnnotationParquetValueReaderVisitor + implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor> { + + private final PrimitiveType primitive; + private final ColumnDescriptor desc; + private final org.apache.iceberg.types.Type.PrimitiveType expected; + + LogicalTypeAnnotationParquetValueReaderVisitor( + PrimitiveType primitive, + ColumnDescriptor desc, + org.apache.iceberg.types.Type.PrimitiveType expected) { + this.primitive = primitive; + this.desc = desc; + this.expected = expected; + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return Optional.of(new StringReader(desc)); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return Optional.of(new StringReader(desc)); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { + return Optional.of(new StringReader(desc)); + } + + @Override + public Optional> visit( + DecimalLogicalTypeAnnotation decimalLogicalType) { + switch (primitive.getPrimitiveTypeName()) { + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return Optional.of( + new BinaryDecimalReader( + desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + case INT64: + return Optional.of( + new LongDecimalReader( + desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + case INT32: + return Optional.of( + new IntegerDecimalReader( + desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + } + + return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(decimalLogicalType); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + if (timeLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MILLIS) { + return Optional.of(new MillisTimeReader(desc)); + } else if (timeLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MICROS) { + return Optional.of(new LossyMicrosToMillisTimeReader(desc)); + } + + return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(timeLogicalType); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MILLIS) { + return Optional.of(new MillisToTimestampReader(desc)); + } else if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MICROS) { + return Optional.of(new MicrosToTimestampReader(desc)); + } else if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.NANOS) { + return Optional.of(new NanosToTimestampReader(desc)); + } + + return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(timestampLogicalType); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + int width = intLogicalType.getBitWidth(); + if (width <= 32) { + if (expected.typeId() == Types.LongType.get().typeId()) { + return Optional.of(new ParquetValueReaders.IntAsLongReader(desc)); + } else { + return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); + } + } else if (width <= 64) { + return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); + } + + return 
LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(intLogicalType); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { + return Optional.of(new ParquetValueReaders.ByteArrayReader(desc)); + } + } + + @Override + @SuppressWarnings("CyclomaticComplexity") + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { + if (expected == null) { + return null; + } + + ColumnDescriptor desc = type.getColumnDescription(currentPath()); + LogicalTypeAnnotation logicalTypeAnnotation = primitive.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation != null) { + return logicalTypeAnnotation + .accept(new LogicalTypeAnnotationParquetValueReaderVisitor(primitive, desc, expected)) + .orElseThrow( + () -> + new UnsupportedOperationException( + "Unsupported logical type: " + primitive.getLogicalTypeAnnotation())); + } + + switch (primitive.getPrimitiveTypeName()) { + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return new ParquetValueReaders.ByteArrayReader(desc); + case INT32: + if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.LONG) { + return new ParquetValueReaders.IntAsLongReader(desc); + } else { + return new ParquetValueReaders.UnboxedReader<>(desc); + } + case FLOAT: + if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { + return new ParquetValueReaders.FloatAsDoubleReader(desc); + } else { + return new ParquetValueReaders.UnboxedReader<>(desc); + } + case BOOLEAN: + case INT64: + case DOUBLE: + return new ParquetValueReaders.UnboxedReader<>(desc); + default: + throw new UnsupportedOperationException("Unsupported type: " + primitive); + } + } + } + + private static class BinaryDecimalReader + extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + BinaryDecimalReader(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + Binary binary = column.nextBinary(); + BigDecimal bigDecimal = new BigDecimal(new BigInteger(binary.getBytes()), scale); + // TODO: need a unit test to write-read-validate decimal via FlinkParquetWrite/Reader + return DecimalData.fromBigDecimal(bigDecimal, precision, scale); + } + } + + private static class IntegerDecimalReader + extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + IntegerDecimalReader(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + return DecimalData.fromUnscaledLong(column.nextInteger(), precision, scale); + } + } + + private static class LongDecimalReader extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + LongDecimalReader(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + return DecimalData.fromUnscaledLong(column.nextLong(), precision, scale); + } + } + + private static class NanosToTimestampReader + extends ParquetValueReaders.UnboxedReader { + NanosToTimestampReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long value = readLong(); + return 
TimestampData.fromEpochMillis( + Math.floorDiv(value, 1_000_000L), Math.floorMod(value, 1_000_000)); + } + } + + private static class MicrosToTimestampReader + extends ParquetValueReaders.UnboxedReader { + MicrosToTimestampReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long micros = readLong(); + return TimestampData.fromEpochMillis( + Math.floorDiv(micros, 1000L), Math.floorMod(micros, 1000) * 1000); + } + } + + private static class MillisToTimestampReader + extends ParquetValueReaders.UnboxedReader { + MillisToTimestampReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long millis = readLong(); + return TimestampData.fromEpochMillis(millis); + } + } + + private static class StringReader extends ParquetValueReaders.PrimitiveReader { + StringReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public StringData read(StringData ignored) { + Binary binary = column.nextBinary(); + ByteBuffer buffer = binary.toByteBuffer(); + if (buffer.hasArray()) { + return StringData.fromBytes( + buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); + } else { + return StringData.fromBytes(binary.getBytes()); + } + } + } + + private static class LossyMicrosToMillisTimeReader + extends ParquetValueReaders.PrimitiveReader { + LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public Integer read(Integer reuse) { + // Discard microseconds since Flink uses millisecond unit for TIME type. + return (int) Math.floorDiv(column.nextLong(), 1000L); + } + } + + private static class MillisTimeReader extends ParquetValueReaders.PrimitiveReader { + MillisTimeReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public Integer read(Integer reuse) { + return (int) column.nextLong(); + } + } + + private static class ArrayReader + extends ParquetValueReaders.RepeatedReader { + private int readPos = 0; + private int writePos = 0; + + ArrayReader(int definitionLevel, int repetitionLevel, ParquetValueReader reader) { + super(definitionLevel, repetitionLevel, reader); + } + + @Override + protected ReusableArrayData newListData(ArrayData reuse) { + this.readPos = 0; + this.writePos = 0; + + if (reuse instanceof ReusableArrayData) { + return (ReusableArrayData) reuse; + } else { + return new ReusableArrayData(); + } + } + + @Override + @SuppressWarnings("unchecked") + protected E getElement(ReusableArrayData list) { + E value = null; + if (readPos < list.capacity()) { + value = (E) list.values[readPos]; + } + + readPos += 1; + + return value; + } + + @Override + protected void addElement(ReusableArrayData reused, E element) { + if (writePos >= reused.capacity()) { + reused.grow(); + } + + reused.values[writePos] = element; + + writePos += 1; + } + + @Override + protected ArrayData buildList(ReusableArrayData list) { + // Since ReusableArrayData is not accepted by Flink, use GenericArrayData temporarily to walk + // around it. + // Revert this to use ReusableArrayData once it is fixed in Flink. + // For your reference, https://issues.apache.org/jira/browse/FLINK-25238. 
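+ // Note: Arrays.copyOf(list.values, writePos) trims the reused backing array to the
+ // element count of the current row, so stale values left over from a previous,
+ // larger row are never exposed in the returned GenericArrayData.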
+ return new GenericArrayData(Arrays.copyOf(list.values, writePos)); + } + } + + private static class MapReader + extends ParquetValueReaders.RepeatedKeyValueReader { + private int readPos = 0; + private int writePos = 0; + + private final ParquetValueReaders.ReusableEntry entry = + new ParquetValueReaders.ReusableEntry<>(); + private final ParquetValueReaders.ReusableEntry nullEntry = + new ParquetValueReaders.ReusableEntry<>(); + + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { + super(definitionLevel, repetitionLevel, keyReader, valueReader); + } + + @Override + protected ReusableMapData newMapData(MapData reuse) { + this.readPos = 0; + this.writePos = 0; + + if (reuse instanceof ReusableMapData) { + return (ReusableMapData) reuse; + } else { + return new ReusableMapData(); + } + } + + @Override + @SuppressWarnings("unchecked") + protected Map.Entry getPair(ReusableMapData map) { + Map.Entry kv = nullEntry; + if (readPos < map.capacity()) { + entry.set((K) map.keys.values[readPos], (V) map.values.values[readPos]); + kv = entry; + } + + readPos += 1; + + return kv; + } + + @Override + protected void addPair(ReusableMapData map, K key, V value) { + if (writePos >= map.capacity()) { + map.grow(); + } + + map.keys.values[writePos] = key; + map.values.values[writePos] = value; + + writePos += 1; + } + + @Override + protected MapData buildMap(ReusableMapData map) { + map.setNumElements(writePos); + return map; + } + } + + private static class RowDataReader + extends ParquetValueReaders.StructReader { + private final int numFields; + + RowDataReader(List> readers) { + super(readers); + this.numFields = readers.size(); + } + + @Override + protected GenericRowData newStructData(RowData reuse) { + if (reuse instanceof GenericRowData) { + return (GenericRowData) reuse; + } else { + return new GenericRowData(numFields); + } + } + + @Override + protected Object getField(GenericRowData intermediate, int pos) { + return intermediate.getField(pos); + } + + @Override + protected RowData buildStruct(GenericRowData struct) { + return struct; + } + + @Override + protected void set(GenericRowData row, int pos, Object value) { + row.setField(pos, value); + } + + @Override + protected void setNull(GenericRowData row, int pos) { + row.setField(pos, null); + } + + @Override + protected void setBoolean(GenericRowData row, int pos, boolean value) { + row.setField(pos, value); + } + + @Override + protected void setInteger(GenericRowData row, int pos, int value) { + row.setField(pos, value); + } + + @Override + protected void setLong(GenericRowData row, int pos, long value) { + row.setField(pos, value); + } + + @Override + protected void setFloat(GenericRowData row, int pos, float value) { + row.setField(pos, value); + } + + @Override + protected void setDouble(GenericRowData row, int pos, double value) { + row.setField(pos, value); + } + } + + private static class ReusableMapData implements MapData { + private final ReusableArrayData keys; + private final ReusableArrayData values; + + private int numElements; + + private ReusableMapData() { + this.keys = new ReusableArrayData(); + this.values = new ReusableArrayData(); + } + + private void grow() { + keys.grow(); + values.grow(); + } + + private int capacity() { + return keys.capacity(); + } + + public void setNumElements(int numElements) { + this.numElements = numElements; + keys.setNumElements(numElements); + values.setNumElements(numElements); + } + + @Override + public int size() { + 
return numElements; + } + + @Override + public ReusableArrayData keyArray() { + return keys; + } + + @Override + public ReusableArrayData valueArray() { + return values; + } + } + + private static class ReusableArrayData implements ArrayData { + private static final Object[] EMPTY = new Object[0]; + + private Object[] values = EMPTY; + private int numElements = 0; + + private void grow() { + if (values.length == 0) { + this.values = new Object[20]; + } else { + Object[] old = values; + this.values = new Object[old.length << 1]; + // copy the old array in case it has values that can be reused + System.arraycopy(old, 0, values, 0, old.length); + } + } + + private int capacity() { + return values.length; + } + + public void setNumElements(int numElements) { + this.numElements = numElements; + } + + @Override + public int size() { + return numElements; + } + + @Override + public boolean isNullAt(int ordinal) { + return null == values[ordinal]; + } + + @Override + public boolean getBoolean(int ordinal) { + return (boolean) values[ordinal]; + } + + @Override + public byte getByte(int ordinal) { + return (byte) values[ordinal]; + } + + @Override + public short getShort(int ordinal) { + return (short) values[ordinal]; + } + + @Override + public int getInt(int ordinal) { + return (int) values[ordinal]; + } + + @Override + public long getLong(int ordinal) { + return (long) values[ordinal]; + } + + @Override + public float getFloat(int ordinal) { + return (float) values[ordinal]; + } + + @Override + public double getDouble(int ordinal) { + return (double) values[ordinal]; + } + + @Override + public StringData getString(int pos) { + return (StringData) values[pos]; + } + + @Override + public DecimalData getDecimal(int pos, int precision, int scale) { + return (DecimalData) values[pos]; + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + return (TimestampData) values[pos]; + } + + @SuppressWarnings("unchecked") + @Override + public RawValueData getRawValue(int pos) { + return (RawValueData) values[pos]; + } + + @Override + public byte[] getBinary(int ordinal) { + return (byte[]) values[ordinal]; + } + + @Override + public ArrayData getArray(int ordinal) { + return (ArrayData) values[ordinal]; + } + + @Override + public MapData getMap(int ordinal) { + return (MapData) values[ordinal]; + } + + @Override + public RowData getRow(int pos, int numFields) { + return (RowData) values[pos]; + } + + @Override + public boolean[] toBooleanArray() { + return ArrayUtil.toPrimitive((Boolean[]) values); + } + + @Override + public byte[] toByteArray() { + return ArrayUtil.toPrimitive((Byte[]) values); + } + + @Override + public short[] toShortArray() { + return ArrayUtil.toPrimitive((Short[]) values); + } + + @Override + public int[] toIntArray() { + return ArrayUtil.toPrimitive((Integer[]) values); + } + + @Override + public long[] toLongArray() { + return ArrayUtil.toPrimitive((Long[]) values); + } + + @Override + public float[] toFloatArray() { + return ArrayUtil.toPrimitive((Float[]) values); + } + + @Override + public double[] toDoubleArray() { + return ArrayUtil.toPrimitive((Double[]) values); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java new file mode 100644 index 000000000000..5c90252723bd --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java @@ -0,0 +1,608 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Optional; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeRoot; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.RowType.RowField; +import org.apache.flink.table.types.logical.SmallIntType; +import org.apache.flink.table.types.logical.TinyIntType; +import org.apache.iceberg.flink.FlinkRowData; +import org.apache.iceberg.parquet.ParquetValueReaders; +import org.apache.iceberg.parquet.ParquetValueWriter; +import org.apache.iceberg.parquet.ParquetValueWriters; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.DecimalUtil; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.BsonLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.DateLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.EnumLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.IntLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.JsonLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.LogicalTypeAnnotationVisitor; +import org.apache.parquet.schema.LogicalTypeAnnotation.StringLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.TimeLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +public class FlinkParquetWriters { + private FlinkParquetWriters() {} + + @SuppressWarnings("unchecked") + public static ParquetValueWriter buildWriter(LogicalType schema, MessageType type) { + 
return (ParquetValueWriter) + ParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); + } + + private static class WriteBuilder extends ParquetWithFlinkSchemaVisitor> { + private final MessageType type; + + WriteBuilder(MessageType type) { + this.type = type; + } + + @Override + public ParquetValueWriter message( + RowType sStruct, MessageType message, List> fields) { + return struct(sStruct, message.asGroupType(), fields); + } + + @Override + public ParquetValueWriter struct( + RowType sStruct, GroupType struct, List> fieldWriters) { + List flinkFields = sStruct.getFields(); + List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); + List flinkTypes = Lists.newArrayList(); + int[] fieldIndexes = new int[fieldWriters.size()]; + int fieldIndex = 0; + for (int i = 0; i < flinkFields.size(); i += 1) { + LogicalType flinkType = flinkFields.get(i).getType(); + if (!flinkType.is(LogicalTypeRoot.NULL)) { + writers.add(newOption(struct.getType(fieldIndex), fieldWriters.get(fieldIndex))); + flinkTypes.add(flinkType); + fieldIndexes[fieldIndex] = i; + fieldIndex += 1; + } + } + + return new RowDataWriter(fieldIndexes, writers, flinkTypes); + } + + @Override + public ParquetValueWriter list( + ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { + GroupType repeated = array.getFields().get(0).asGroupType(); + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath); + int repeatedR = type.getMaxRepetitionLevel(repeatedPath); + + return new ArrayDataWriter<>( + repeatedD, + repeatedR, + newOption(repeated.getType(0), elementWriter), + sArray.getElementType()); + } + + @Override + public ParquetValueWriter map( + MapType sMap, + GroupType map, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { + GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath); + int repeatedR = type.getMaxRepetitionLevel(repeatedPath); + + return new MapDataWriter<>( + repeatedD, + repeatedR, + newOption(repeatedKeyValue.getType(0), keyWriter), + newOption(repeatedKeyValue.getType(1), valueWriter), + sMap.getKeyType(), + sMap.getValueType()); + } + + private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { + int maxD = type.getMaxDefinitionLevel(path(fieldType.getName())); + return ParquetValueWriters.option(fieldType, maxD, writer); + } + + @Override + public ParquetValueWriter primitive(LogicalType fType, PrimitiveType primitive) { + ColumnDescriptor desc = type.getColumnDescription(currentPath()); + + LogicalTypeAnnotation annotation = primitive.getLogicalTypeAnnotation(); + if (annotation != null) { + Optional> writer = + annotation.accept(new LogicalTypeWriterBuilder(fType, desc)); + if (writer.isPresent()) { + return writer.get(); + } else { + throw new UnsupportedOperationException( + "Unsupported logical type: " + primitive.getOriginalType()); + } + } + + switch (primitive.getPrimitiveTypeName()) { + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return byteArrays(desc); + case BOOLEAN: + return ParquetValueWriters.booleans(desc); + case INT32: + return ints(fType, desc); + case INT64: + return ParquetValueWriters.longs(desc); + case FLOAT: + return ParquetValueWriters.floats(desc); + case DOUBLE: + return ParquetValueWriters.doubles(desc); + default: + throw new UnsupportedOperationException("Unsupported type: " + primitive); + } + } + } + + private static class 
LogicalTypeWriterBuilder + implements LogicalTypeAnnotationVisitor> { + private final LogicalType flinkType; + private final ColumnDescriptor desc; + + private LogicalTypeWriterBuilder(LogicalType flinkType, ColumnDescriptor desc) { + this.flinkType = flinkType; + this.desc = desc; + } + + @Override + public Optional> visit(StringLogicalTypeAnnotation strings) { + return Optional.of(strings(desc)); + } + + @Override + public Optional> visit(EnumLogicalTypeAnnotation enums) { + return Optional.of(strings(desc)); + } + + @Override + public Optional> visit(DecimalLogicalTypeAnnotation decimal) { + ParquetValueWriter writer; + switch (desc.getPrimitiveType().getPrimitiveTypeName()) { + case INT32: + writer = decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); + break; + case INT64: + writer = decimalAsLong(desc, decimal.getPrecision(), decimal.getScale()); + break; + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + writer = decimalAsFixed(desc, decimal.getPrecision(), decimal.getScale()); + break; + default: + throw new UnsupportedOperationException( + "Unsupported base type for decimal: " + + desc.getPrimitiveType().getPrimitiveTypeName()); + } + return Optional.of(writer); + } + + @Override + public Optional> visit(DateLogicalTypeAnnotation dates) { + return Optional.of(ints(flinkType, desc)); + } + + @Override + public Optional> visit(TimeLogicalTypeAnnotation times) { + Preconditions.checkArgument( + LogicalTypeAnnotation.TimeUnit.MICROS.equals(times.getUnit()), + "Cannot write time in %s, only MICROS is supported", + times.getUnit()); + return Optional.of(timeMicros(desc)); + } + + @Override + public Optional> visit(TimestampLogicalTypeAnnotation timestamps) { + ParquetValueWriter writer; + switch (timestamps.getUnit()) { + case NANOS: + writer = timestampNanos(desc); + break; + case MICROS: + writer = timestamps(desc); + break; + default: + throw new UnsupportedOperationException("Unsupported timestamp type: " + timestamps); + } + + return Optional.of(writer); + } + + @Override + public Optional> visit(IntLogicalTypeAnnotation type) { + Preconditions.checkArgument(type.isSigned(), "Cannot write unsigned integer type: %s", type); + ParquetValueWriter writer; + if (type.getBitWidth() < 64) { + writer = ints(flinkType, desc); + } else { + writer = ParquetValueWriters.longs(desc); + } + + return Optional.of(writer); + } + + @Override + public Optional> visit(JsonLogicalTypeAnnotation ignored) { + return Optional.of(strings(desc)); + } + + @Override + public Optional> visit(BsonLogicalTypeAnnotation ignored) { + return Optional.of(byteArrays(desc)); + } + } + + private static ParquetValueWriter ints(LogicalType type, ColumnDescriptor desc) { + if (type instanceof TinyIntType) { + return ParquetValueWriters.tinyints(desc); + } else if (type instanceof SmallIntType) { + return ParquetValueWriters.shorts(desc); + } + return ParquetValueWriters.ints(desc); + } + + private static ParquetValueWriter strings(ColumnDescriptor desc) { + return new StringDataWriter(desc); + } + + private static ParquetValueWriter timeMicros(ColumnDescriptor desc) { + return new TimeMicrosWriter(desc); + } + + private static ParquetValueWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 9, + "Cannot write decimal value as integer with precision larger than 9," + + " wrong precision %s", + precision); + return new IntegerDecimalWriter(desc, precision, scale); + } + + private static ParquetValueWriter decimalAsLong( + ColumnDescriptor 
desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 18, + "Cannot write decimal value as long with precision larger than 18, " + + " wrong precision %s", + precision); + return new LongDecimalWriter(desc, precision, scale); + } + + private static ParquetValueWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { + return new FixedDecimalWriter(desc, precision, scale); + } + + private static ParquetValueWriter timestamps(ColumnDescriptor desc) { + return new TimestampDataWriter(desc); + } + + private static ParquetValueWriter timestampNanos(ColumnDescriptor desc) { + return new TimestampNanoDataWriter(desc); + } + + private static ParquetValueWriter byteArrays(ColumnDescriptor desc) { + return new ByteArrayWriter(desc); + } + + private static class StringDataWriter extends ParquetValueWriters.PrimitiveWriter { + private StringDataWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, StringData value) { + column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(value.toBytes())); + } + } + + private static class TimeMicrosWriter extends ParquetValueWriters.PrimitiveWriter { + private TimeMicrosWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, Integer value) { + long micros = value.longValue() * 1000; + column.writeLong(repetitionLevel, micros); + } + } + + private static class IntegerDecimalWriter + extends ParquetValueWriters.PrimitiveWriter { + private final int precision; + private final int scale; + + private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public void write(int repetitionLevel, DecimalData decimal) { + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); + + column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); + } + } + + private static class LongDecimalWriter extends ParquetValueWriters.PrimitiveWriter { + private final int precision; + private final int scale; + + private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public void write(int repetitionLevel, DecimalData decimal) { + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); + + column.writeLong(repetitionLevel, decimal.toUnscaledLong()); + } + } + + private static class FixedDecimalWriter extends ParquetValueWriters.PrimitiveWriter { + private final int precision; + private final int scale; + private final ThreadLocal bytes; + + private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + } + + @Override + public void write(int repetitionLevel, DecimalData decimal) { + byte[] binary = + DecimalUtil.toReusedFixLengthBytes(precision, 
scale, decimal.toBigDecimal(), bytes.get()); + column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); + } + } + + private static class TimestampDataWriter + extends ParquetValueWriters.PrimitiveWriter { + private TimestampDataWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, TimestampData value) { + column.writeLong( + repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); + } + } + + private static class TimestampNanoDataWriter + extends ParquetValueWriters.PrimitiveWriter { + private TimestampNanoDataWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, TimestampData value) { + column.writeLong( + repetitionLevel, value.getMillisecond() * 1_000_000L + value.getNanoOfMillisecond()); + } + } + + private static class ByteArrayWriter extends ParquetValueWriters.PrimitiveWriter { + private ByteArrayWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, byte[] bytes) { + column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(bytes)); + } + } + + private static class ArrayDataWriter extends ParquetValueWriters.RepeatedWriter { + private final LogicalType elementType; + + private ArrayDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter writer, + LogicalType elementType) { + super(definitionLevel, repetitionLevel, writer); + this.elementType = elementType; + } + + @Override + protected Iterator elements(ArrayData list) { + return new ElementIterator<>(list); + } + + private class ElementIterator implements Iterator { + private final int size; + private final ArrayData list; + private final ArrayData.ElementGetter getter; + private int index; + + private ElementIterator(ArrayData list) { + this.list = list; + size = list.size(); + getter = ArrayData.createElementGetter(elementType); + index = 0; + } + + @Override + public boolean hasNext() { + return index != size; + } + + @Override + @SuppressWarnings("unchecked") + public E next() { + if (index >= size) { + throw new NoSuchElementException(); + } + + E element = (E) getter.getElementOrNull(list, index); + index += 1; + + return element; + } + } + } + + private static class MapDataWriter + extends ParquetValueWriters.RepeatedKeyValueWriter { + private final LogicalType keyType; + private final LogicalType valueType; + + private MapDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { + super(definitionLevel, repetitionLevel, keyWriter, valueWriter); + this.keyType = keyType; + this.valueType = valueType; + } + + @Override + protected Iterator> pairs(MapData map) { + return new EntryIterator<>(map); + } + + private class EntryIterator implements Iterator> { + private final int size; + private final ArrayData keys; + private final ArrayData values; + private final ParquetValueReaders.ReusableEntry entry; + private final ArrayData.ElementGetter keyGetter; + private final ArrayData.ElementGetter valueGetter; + private int index; + + private EntryIterator(MapData map) { + size = map.size(); + keys = map.keyArray(); + values = map.valueArray(); + entry = new ParquetValueReaders.ReusableEntry<>(); + keyGetter = ArrayData.createElementGetter(keyType); + valueGetter = ArrayData.createElementGetter(valueType); + index = 0; + } + + @Override + public boolean hasNext() { + return index != size; + } + 
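+ // This iterator reuses a single ReusableEntry: next() overwrites the same instance
+ // on every call, so each returned entry must be consumed before advancing.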
+ @Override + @SuppressWarnings("unchecked") + public Map.Entry next() { + if (index >= size) { + throw new NoSuchElementException(); + } + + entry.set( + (K) keyGetter.getElementOrNull(keys, index), + (V) valueGetter.getElementOrNull(values, index)); + index += 1; + + return entry; + } + } + } + + private static class RowDataWriter extends ParquetValueWriters.StructWriter { + private final RowData.FieldGetter[] fieldGetter; + + RowDataWriter( + int[] fieldIndexes, List> writers, List types) { + super(writers); + fieldGetter = new RowData.FieldGetter[types.size()]; + for (int i = 0; i < types.size(); i += 1) { + fieldGetter[i] = FlinkRowData.createFieldGetter(types.get(i), fieldIndexes[i]); + } + } + + @Override + protected Object get(RowData struct, int index) { + return fieldGetter[index].getFieldOrNull(struct); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java new file mode 100644 index 000000000000..edc7041a4d04 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.function.Supplier; +import org.apache.avro.LogicalType; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.io.DatumReader; +import org.apache.avro.io.Decoder; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.avro.AvroWithPartnerVisitor; +import org.apache.iceberg.avro.SupportsRowPosition; +import org.apache.iceberg.avro.ValueReader; +import org.apache.iceberg.avro.ValueReaders; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.Pair; + +public class FlinkPlannedAvroReader implements DatumReader, SupportsRowPosition { + + private final Types.StructType expectedType; + private final Map idToConstant; + private ValueReader reader; + + public static FlinkPlannedAvroReader create(org.apache.iceberg.Schema schema) { + return create(schema, ImmutableMap.of()); + } + + public static FlinkPlannedAvroReader create( + org.apache.iceberg.Schema schema, Map constants) { + return new FlinkPlannedAvroReader(schema, constants); + } + + private FlinkPlannedAvroReader( + org.apache.iceberg.Schema expectedSchema, Map constants) { + this.expectedType = expectedSchema.asStruct(); + this.idToConstant = constants; + } + + @Override + @SuppressWarnings("unchecked") + public void setSchema(Schema fileSchema) { + this.reader = + (ValueReader) + AvroWithPartnerVisitor.visit( + expectedType, + fileSchema, + new ReadBuilder(idToConstant), + AvroWithPartnerVisitor.FieldIDAccessors.get()); + } + + @Override + public RowData read(RowData reuse, Decoder decoder) throws IOException { + return reader.read(decoder, reuse); + } + + @Override + public void setRowPositionSupplier(Supplier posSupplier) { + if (reader instanceof SupportsRowPosition) { + ((SupportsRowPosition) reader).setRowPositionSupplier(posSupplier); + } + } + + private static class ReadBuilder extends AvroWithPartnerVisitor> { + private final Map idToConstant; + + private ReadBuilder(Map idToConstant) { + this.idToConstant = idToConstant; + } + + @Override + public ValueReader record(Type partner, Schema record, List> fieldReaders) { + if (partner == null) { + return ValueReaders.skipStruct(fieldReaders); + } + + Types.StructType expected = partner.asStructType(); + List>> readPlan = + ValueReaders.buildReadPlan( + expected, record, fieldReaders, idToConstant, RowDataUtil::convertConstant); + + // TODO: should this pass expected so that struct.get can reuse containers? 
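+ // readPlan pairs each position in the expected struct with its reader (or a
+ // constant from idToConstant), so field reordering and constant fields are
+ // resolved once here rather than per record.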
+ return FlinkValueReaders.struct(readPlan, expected.fields().size()); + } + + @Override + public ValueReader union(Type partner, Schema union, List> options) { + return ValueReaders.union(options); + } + + @Override + public ValueReader array(Type partner, Schema array, ValueReader elementReader) { + return FlinkValueReaders.array(elementReader); + } + + @Override + public ValueReader arrayMap( + Type partner, Schema map, ValueReader keyReader, ValueReader valueReader) { + return FlinkValueReaders.arrayMap(keyReader, valueReader); + } + + @Override + public ValueReader map(Type partner, Schema map, ValueReader valueReader) { + return FlinkValueReaders.map(FlinkValueReaders.strings(), valueReader); + } + + @Override + public ValueReader primitive(Type partner, Schema primitive) { + LogicalType logicalType = primitive.getLogicalType(); + if (logicalType != null) { + switch (logicalType.getName()) { + case "date": + // Flink uses the same representation + return ValueReaders.ints(); + + case "time-micros": + return FlinkValueReaders.timeMicros(); + + case "timestamp-millis": + return FlinkValueReaders.timestampMills(); + + case "timestamp-micros": + return FlinkValueReaders.timestampMicros(); + + case "timestamp-nanos": + return FlinkValueReaders.timestampNanos(); + + case "decimal": + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; + return FlinkValueReaders.decimal( + ValueReaders.decimalBytesReader(primitive), + decimal.getPrecision(), + decimal.getScale()); + + case "uuid": + return FlinkValueReaders.uuids(); + + default: + throw new IllegalArgumentException("Unknown logical type: " + logicalType.getName()); + } + } + + switch (primitive.getType()) { + case NULL: + return ValueReaders.nulls(); + case BOOLEAN: + return ValueReaders.booleans(); + case INT: + if (partner != null && partner.typeId() == Type.TypeID.LONG) { + return ValueReaders.intsAsLongs(); + } + return ValueReaders.ints(); + case LONG: + return ValueReaders.longs(); + case FLOAT: + if (partner != null && partner.typeId() == Type.TypeID.DOUBLE) { + return ValueReaders.floatsAsDoubles(); + } + return ValueReaders.floats(); + case DOUBLE: + return ValueReaders.doubles(); + case STRING: + return FlinkValueReaders.strings(); + case FIXED: + return ValueReaders.fixed(primitive.getFixedSize()); + case BYTES: + return ValueReaders.bytes(); + case ENUM: + return FlinkValueReaders.enums(primitive.getEnumSymbols()); + default: + throw new IllegalArgumentException("Unsupported type: " + primitive); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java new file mode 100644 index 000000000000..ba4e1a7a7aec --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.util.List; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +abstract class FlinkSchemaVisitor { + + static T visit(RowType flinkType, Schema schema, FlinkSchemaVisitor visitor) { + return visit(flinkType, schema.asStruct(), visitor); + } + + private static T visit(LogicalType flinkType, Type iType, FlinkSchemaVisitor visitor) { + switch (iType.typeId()) { + case STRUCT: + return visitRecord(flinkType, iType.asStructType(), visitor); + + case MAP: + MapType mapType = (MapType) flinkType; + Types.MapType iMapType = iType.asMapType(); + T key; + T value; + + Types.NestedField keyField = iMapType.field(iMapType.keyId()); + visitor.beforeMapKey(keyField); + try { + key = visit(mapType.getKeyType(), iMapType.keyType(), visitor); + } finally { + visitor.afterMapKey(keyField); + } + + Types.NestedField valueField = iMapType.field(iMapType.valueId()); + visitor.beforeMapValue(valueField); + try { + value = visit(mapType.getValueType(), iMapType.valueType(), visitor); + } finally { + visitor.afterMapValue(valueField); + } + + return visitor.map(iMapType, key, value, mapType.getKeyType(), mapType.getValueType()); + + case LIST: + ArrayType listType = (ArrayType) flinkType; + Types.ListType iListType = iType.asListType(); + T element; + + Types.NestedField elementField = iListType.field(iListType.elementId()); + visitor.beforeListElement(elementField); + try { + element = visit(listType.getElementType(), iListType.elementType(), visitor); + } finally { + visitor.afterListElement(elementField); + } + + return visitor.list(iListType, element, listType.getElementType()); + + default: + return visitor.primitive(iType.asPrimitiveType(), flinkType); + } + } + + private static T visitRecord( + LogicalType flinkType, Types.StructType struct, FlinkSchemaVisitor visitor) { + Preconditions.checkArgument(flinkType instanceof RowType, "%s is not a RowType.", flinkType); + RowType rowType = (RowType) flinkType; + + int fieldSize = struct.fields().size(); + List results = Lists.newArrayListWithExpectedSize(fieldSize); + List fieldTypes = Lists.newArrayListWithExpectedSize(fieldSize); + List nestedFields = struct.fields(); + + for (int i = 0; i < fieldSize; i++) { + Types.NestedField iField = nestedFields.get(i); + int fieldIndex = rowType.getFieldIndex(iField.name()); + Preconditions.checkArgument( + fieldIndex >= 0, "NestedField: %s is not found in flink RowType: %s", iField, rowType); + + LogicalType fieldFlinkType = rowType.getTypeAt(fieldIndex); + + fieldTypes.add(fieldFlinkType); + + visitor.beforeField(iField); + try { + results.add(visit(fieldFlinkType, iField.type(), visitor)); + } finally { + 
visitor.afterField(iField); + } + } + + return visitor.record(struct, results, fieldTypes); + } + + public T record(Types.StructType iStruct, List results, List fieldTypes) { + return null; + } + + public T list(Types.ListType iList, T element, LogicalType elementType) { + return null; + } + + public T map(Types.MapType iMap, T key, T value, LogicalType keyType, LogicalType valueType) { + return null; + } + + public T primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { + return null; + } + + public void beforeField(Types.NestedField field) {} + + public void afterField(Types.NestedField field) {} + + public void beforeListElement(Types.NestedField elementField) { + beforeField(elementField); + } + + public void afterListElement(Types.NestedField elementField) { + afterField(elementField); + } + + public void beforeMapKey(Types.NestedField keyField) { + beforeField(keyField); + } + + public void afterMapKey(Types.NestedField keyField) { + afterField(keyField); + } + + public void beforeMapValue(Types.NestedField valueField) { + beforeField(valueField); + } + + public void afterMapValue(Types.NestedField valueField) { + afterField(valueField); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java new file mode 100644 index 000000000000..80b36d939ece --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Map; +import org.apache.avro.io.Decoder; +import org.apache.avro.util.Utf8; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.avro.ValueReader; +import org.apache.iceberg.avro.ValueReaders; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.Pair; + +public class FlinkValueReaders { + + private FlinkValueReaders() {} + + static ValueReader strings() { + return StringReader.INSTANCE; + } + + static ValueReader enums(List symbols) { + return new EnumReader(symbols); + } + + static ValueReader uuids() { + return ValueReaders.fixed(16); + } + + static ValueReader timeMicros() { + return TimeMicrosReader.INSTANCE; + } + + static ValueReader timestampMills() { + return TimestampMillsReader.INSTANCE; + } + + static ValueReader timestampMicros() { + return TimestampMicrosReader.INSTANCE; + } + + static ValueReader timestampNanos() { + return TimestampNanosReader.INSTANCE; + } + + static ValueReader decimal( + ValueReader unscaledReader, int precision, int scale) { + return new DecimalReader(unscaledReader, precision, scale); + } + + static ValueReader array(ValueReader elementReader) { + return new ArrayReader(elementReader); + } + + static ValueReader arrayMap(ValueReader keyReader, ValueReader valueReader) { + return new ArrayMapReader(keyReader, valueReader); + } + + static ValueReader map(ValueReader keyReader, ValueReader valueReader) { + return new MapReader(keyReader, valueReader); + } + + static ValueReader struct(List>> readPlan, int numFields) { + return new PlannedStructReader(readPlan, numFields); + } + + static ValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { + return new StructReader(readers, struct, idToConstant); + } + + private static class StringReader implements ValueReader { + private static final StringReader INSTANCE = new StringReader(); + + private StringReader() {} + + @Override + public StringData read(Decoder decoder, Object reuse) throws IOException { + // use the decoder's readString(Utf8) method because it may be a resolving decoder + Utf8 utf8 = null; + if (reuse instanceof StringData) { + utf8 = new Utf8(((StringData) reuse).toBytes()); + } + + Utf8 string = decoder.readString(utf8); + return StringData.fromBytes(string.getBytes(), 0, string.getByteLength()); + } + } + + private static class EnumReader implements ValueReader { + private final StringData[] symbols; + + private EnumReader(List symbols) { + this.symbols = new StringData[symbols.size()]; + for (int i = 0; i < this.symbols.length; i += 1) { + this.symbols[i] = StringData.fromBytes(symbols.get(i).getBytes(StandardCharsets.UTF_8)); + } + } + + @Override + public StringData read(Decoder decoder, Object ignore) throws IOException { + int index = decoder.readEnum(); + return symbols[index]; + 
} + } + + private static class DecimalReader implements ValueReader { + private final ValueReader bytesReader; + private final int precision; + private final int scale; + + private DecimalReader(ValueReader bytesReader, int precision, int scale) { + this.bytesReader = bytesReader; + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(Decoder decoder, Object reuse) throws IOException { + byte[] bytes = bytesReader.read(decoder, null); + return DecimalData.fromBigDecimal( + new BigDecimal(new BigInteger(bytes), scale), precision, scale); + } + } + + private static class TimeMicrosReader implements ValueReader { + private static final TimeMicrosReader INSTANCE = new TimeMicrosReader(); + + @Override + public Integer read(Decoder decoder, Object reuse) throws IOException { + long micros = decoder.readLong(); + // Flink only support time mills, just erase micros. + return (int) (micros / 1000); + } + } + + private static class TimestampMillsReader implements ValueReader { + private static final TimestampMillsReader INSTANCE = new TimestampMillsReader(); + + @Override + public TimestampData read(Decoder decoder, Object reuse) throws IOException { + return TimestampData.fromEpochMillis(decoder.readLong()); + } + } + + private static class TimestampMicrosReader implements ValueReader { + private static final TimestampMicrosReader INSTANCE = new TimestampMicrosReader(); + + @Override + public TimestampData read(Decoder decoder, Object reuse) throws IOException { + long micros = decoder.readLong(); + long mills = Math.floorDiv(micros, 1000); + int nanos = Math.floorMod(micros, 1000) * 1000; + return TimestampData.fromEpochMillis(mills, nanos); + } + } + + private static class TimestampNanosReader implements ValueReader { + private static final TimestampNanosReader INSTANCE = new TimestampNanosReader(); + + @Override + public TimestampData read(Decoder decoder, Object reuse) throws IOException { + long nanos = decoder.readLong(); + long mills = Math.floorDiv(nanos, 1_000_000); + int leftover = Math.floorMod(nanos, 1_000_000); + return TimestampData.fromEpochMillis(mills, leftover); + } + } + + private static class ArrayReader implements ValueReader { + private final ValueReader elementReader; + private final List reusedList = Lists.newArrayList(); + + private ArrayReader(ValueReader elementReader) { + this.elementReader = elementReader; + } + + @Override + public GenericArrayData read(Decoder decoder, Object reuse) throws IOException { + reusedList.clear(); + long chunkLength = decoder.readArrayStart(); + + while (chunkLength > 0) { + for (int i = 0; i < chunkLength; i += 1) { + reusedList.add(elementReader.read(decoder, null)); + } + + chunkLength = decoder.arrayNext(); + } + + // this will convert the list to an array so it is okay to reuse the list + return new GenericArrayData(reusedList.toArray()); + } + } + + private static MapData kvArrayToMap(List keyList, List valueList) { + Map map = Maps.newHashMap(); + Object[] keys = keyList.toArray(); + Object[] values = valueList.toArray(); + for (int i = 0; i < keys.length; i++) { + map.put(keys[i], values[i]); + } + + return new GenericMapData(map); + } + + private static class ArrayMapReader implements ValueReader { + private final ValueReader keyReader; + private final ValueReader valueReader; + + private final List reusedKeyList = Lists.newArrayList(); + private final List reusedValueList = Lists.newArrayList(); + + private ArrayMapReader(ValueReader keyReader, ValueReader valueReader) { + this.keyReader = 
keyReader; + this.valueReader = valueReader; + } + + @Override + public MapData read(Decoder decoder, Object reuse) throws IOException { + reusedKeyList.clear(); + reusedValueList.clear(); + + long chunkLength = decoder.readArrayStart(); + + while (chunkLength > 0) { + for (int i = 0; i < chunkLength; i += 1) { + reusedKeyList.add(keyReader.read(decoder, null)); + reusedValueList.add(valueReader.read(decoder, null)); + } + + chunkLength = decoder.arrayNext(); + } + + return kvArrayToMap(reusedKeyList, reusedValueList); + } + } + + private static class MapReader implements ValueReader { + private final ValueReader keyReader; + private final ValueReader valueReader; + + private final List reusedKeyList = Lists.newArrayList(); + private final List reusedValueList = Lists.newArrayList(); + + private MapReader(ValueReader keyReader, ValueReader valueReader) { + this.keyReader = keyReader; + this.valueReader = valueReader; + } + + @Override + public MapData read(Decoder decoder, Object reuse) throws IOException { + reusedKeyList.clear(); + reusedValueList.clear(); + + long chunkLength = decoder.readMapStart(); + + while (chunkLength > 0) { + for (int i = 0; i < chunkLength; i += 1) { + reusedKeyList.add(keyReader.read(decoder, null)); + reusedValueList.add(valueReader.read(decoder, null)); + } + + chunkLength = decoder.mapNext(); + } + + return kvArrayToMap(reusedKeyList, reusedValueList); + } + } + + private static class PlannedStructReader extends ValueReaders.PlannedStructReader { + private final int numFields; + + private PlannedStructReader(List>> readPlan, int numFields) { + super(readPlan); + this.numFields = numFields; + } + + @Override + protected RowData reuseOrCreate(Object reuse) { + if (reuse instanceof GenericRowData && ((GenericRowData) reuse).getArity() == numFields) { + return (RowData) reuse; + } + return new GenericRowData(numFields); + } + + @Override + protected Object get(RowData struct, int pos) { + return null; + } + + @Override + protected void set(RowData struct, int pos, Object value) { + ((GenericRowData) struct).setField(pos, value); + } + } + + private static class StructReader extends ValueReaders.StructReader { + private final int numFields; + + private StructReader( + List> readers, Types.StructType struct, Map idToConstant) { + super(readers, struct, idToConstant); + this.numFields = readers.size(); + } + + @Override + protected RowData reuseOrCreate(Object reuse) { + if (reuse instanceof GenericRowData && ((GenericRowData) reuse).getArity() == numFields) { + return (GenericRowData) reuse; + } + return new GenericRowData(numFields); + } + + @Override + protected Object get(RowData struct, int pos) { + return null; + } + + @Override + protected void set(RowData struct, int pos, Object value) { + ((GenericRowData) struct).setField(pos, value); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java new file mode 100644 index 000000000000..f87e63704965 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java @@ -0,0 +1,269 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
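As an aside from the patch itself: the micros-to-millis split used by TimestampMicrosReader above relies on floor division so that pre-epoch values stay correct. A tiny standalone sketch (class name and values are illustrative only, not part of this patch):

import org.apache.flink.table.data.TimestampData;

// Illustrative only: the same Math.floorDiv/floorMod split that TimestampMicrosReader
// uses to map epoch microseconds onto Flink's millisecond + nano-of-millisecond form.
public class TimestampMicrosSplitExample {
  public static void main(String[] args) {
    long micros = -1_500L;                          // 1.5 ms before the epoch
    long millis = Math.floorDiv(micros, 1000);      // -2, not -1: floor semantics
    int nanos = Math.floorMod(micros, 1000) * 1000; // 500_000 ns within that millisecond
    TimestampData ts = TimestampData.fromEpochMillis(millis, nanos);
    System.out.println(ts);                         // prints the instant 1.5 ms before the epoch
  }
}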
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.io.IOException; +import java.lang.reflect.Array; +import java.util.List; +import org.apache.avro.io.Encoder; +import org.apache.avro.util.Utf8; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.iceberg.avro.ValueWriter; +import org.apache.iceberg.flink.FlinkRowData; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.DecimalUtil; + +public class FlinkValueWriters { + + private FlinkValueWriters() {} + + static ValueWriter strings() { + return StringWriter.INSTANCE; + } + + static ValueWriter timeMicros() { + return TimeMicrosWriter.INSTANCE; + } + + static ValueWriter timestampMicros() { + return TimestampMicrosWriter.INSTANCE; + } + + static ValueWriter timestampNanos() { + return TimestampNanosWriter.INSTANCE; + } + + static ValueWriter decimal(int precision, int scale) { + return new DecimalWriter(precision, scale); + } + + static ValueWriter array(ValueWriter elementWriter, LogicalType elementType) { + return new ArrayWriter<>(elementWriter, elementType); + } + + static ValueWriter arrayMap( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { + return new ArrayMapWriter<>(keyWriter, keyType, valueWriter, valueType); + } + + static ValueWriter map( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { + return new MapWriter<>(keyWriter, keyType, valueWriter, valueType); + } + + static ValueWriter row(List> writers, List types) { + return new RowWriter(writers, types); + } + + private static class StringWriter implements ValueWriter { + private static final StringWriter INSTANCE = new StringWriter(); + + private StringWriter() {} + + @Override + public void write(StringData s, Encoder encoder) throws IOException { + // toBytes is cheaper than Avro calling toString, which incurs encoding costs + encoder.writeString(new Utf8(s.toBytes())); + } + } + + private static class DecimalWriter implements ValueWriter { + private final int precision; + private final int scale; + private final ThreadLocal bytes; + + private DecimalWriter(int precision, int scale) { + this.precision = precision; + this.scale = scale; + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + } + + @Override + public void write(DecimalData d, Encoder encoder) throws IOException { + encoder.writeFixed( + DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), bytes.get())); + } + } + + private static class TimeMicrosWriter implements ValueWriter { + private static final TimeMicrosWriter INSTANCE = 
new TimeMicrosWriter(); + + @Override + public void write(Integer timeMills, Encoder encoder) throws IOException { + encoder.writeLong(timeMills * 1000L); + } + } + + private static class TimestampMicrosWriter implements ValueWriter { + private static final TimestampMicrosWriter INSTANCE = new TimestampMicrosWriter(); + + @Override + public void write(TimestampData timestampData, Encoder encoder) throws IOException { + long micros = + timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; + encoder.writeLong(micros); + } + } + + private static class TimestampNanosWriter implements ValueWriter { + private static final TimestampNanosWriter INSTANCE = new TimestampNanosWriter(); + + @Override + public void write(TimestampData timestampData, Encoder encoder) throws IOException { + long nanos = + timestampData.getMillisecond() * 1_000_000 + timestampData.getNanoOfMillisecond(); + encoder.writeLong(nanos); + } + } + + private static class ArrayWriter implements ValueWriter { + private final ValueWriter elementWriter; + private final ArrayData.ElementGetter elementGetter; + + private ArrayWriter(ValueWriter elementWriter, LogicalType elementType) { + this.elementWriter = elementWriter; + this.elementGetter = ArrayData.createElementGetter(elementType); + } + + @Override + @SuppressWarnings("unchecked") + public void write(ArrayData array, Encoder encoder) throws IOException { + encoder.writeArrayStart(); + int numElements = array.size(); + encoder.setItemCount(numElements); + for (int i = 0; i < numElements; i += 1) { + encoder.startItem(); + elementWriter.write((T) elementGetter.getElementOrNull(array, i), encoder); + } + encoder.writeArrayEnd(); + } + } + + private static class ArrayMapWriter implements ValueWriter { + private final ValueWriter keyWriter; + private final ValueWriter valueWriter; + private final ArrayData.ElementGetter keyGetter; + private final ArrayData.ElementGetter valueGetter; + + private ArrayMapWriter( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { + this.keyWriter = keyWriter; + this.keyGetter = ArrayData.createElementGetter(keyType); + this.valueWriter = valueWriter; + this.valueGetter = ArrayData.createElementGetter(valueType); + } + + @Override + @SuppressWarnings("unchecked") + public void write(MapData map, Encoder encoder) throws IOException { + encoder.writeArrayStart(); + int numElements = map.size(); + encoder.setItemCount(numElements); + ArrayData keyArray = map.keyArray(); + ArrayData valueArray = map.valueArray(); + for (int i = 0; i < numElements; i += 1) { + encoder.startItem(); + keyWriter.write((K) keyGetter.getElementOrNull(keyArray, i), encoder); + valueWriter.write((V) valueGetter.getElementOrNull(valueArray, i), encoder); + } + encoder.writeArrayEnd(); + } + } + + private static class MapWriter implements ValueWriter { + private final ValueWriter keyWriter; + private final ValueWriter valueWriter; + private final ArrayData.ElementGetter keyGetter; + private final ArrayData.ElementGetter valueGetter; + + private MapWriter( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { + this.keyWriter = keyWriter; + this.keyGetter = ArrayData.createElementGetter(keyType); + this.valueWriter = valueWriter; + this.valueGetter = ArrayData.createElementGetter(valueType); + } + + @Override + @SuppressWarnings("unchecked") + public void write(MapData map, Encoder encoder) throws IOException { + encoder.writeMapStart(); + int numElements 
= map.size(); + encoder.setItemCount(numElements); + ArrayData keyArray = map.keyArray(); + ArrayData valueArray = map.valueArray(); + for (int i = 0; i < numElements; i += 1) { + encoder.startItem(); + keyWriter.write((K) keyGetter.getElementOrNull(keyArray, i), encoder); + valueWriter.write((V) valueGetter.getElementOrNull(valueArray, i), encoder); + } + encoder.writeMapEnd(); + } + } + + static class RowWriter implements ValueWriter { + private final ValueWriter[] writers; + private final RowData.FieldGetter[] getters; + + private RowWriter(List> writers, List types) { + this.writers = (ValueWriter[]) Array.newInstance(ValueWriter.class, writers.size()); + this.getters = new RowData.FieldGetter[writers.size()]; + for (int i = 0; i < writers.size(); i += 1) { + this.writers[i] = writers.get(i); + this.getters[i] = FlinkRowData.createFieldGetter(types.get(i), i); + } + } + + @Override + public void write(RowData row, Encoder encoder) throws IOException { + for (int i = 0; i < writers.length; i += 1) { + if (row.isNullAt(i)) { + writers[i].write(null, encoder); + } else { + write(row, i, writers[i], encoder); + } + } + } + + @SuppressWarnings("unchecked") + private void write(RowData row, int pos, ValueWriter writer, Encoder encoder) + throws IOException { + writer.write((T) getters[pos].getFieldOrNull(row), encoder); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java new file mode 100644 index 000000000000..39aac237a8f6 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
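One more illustration, this time of the writer side: DecimalWriter above sizes a reusable fixed-length buffer from the precision and lets DecimalUtil fill it. A minimal sketch with made-up precision and scale (not part of the patch):

import java.math.BigDecimal;
import org.apache.flink.table.data.DecimalData;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.util.DecimalUtil;

// Illustrative only: how the fixed-length decimal bytes written by DecimalWriter are produced.
public class DecimalFixedBytesExample {
  public static void main(String[] args) {
    int precision = 9;
    int scale = 2;
    DecimalData d = DecimalData.fromBigDecimal(new BigDecimal("123.45"), precision, scale);

    byte[] reused = new byte[TypeUtil.decimalRequiredBytes(precision)]; // 4 bytes for precision 9
    byte[] fixed = DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), reused);
    System.out.println(fixed.length); // 4
  }
}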
+ */ +package org.apache.iceberg.flink.data; + +import java.util.Deque; +import java.util.List; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeRoot; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.RowType.RowField; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +public class ParquetWithFlinkSchemaVisitor { + private final Deque fieldNames = Lists.newLinkedList(); + + public static T visit( + LogicalType sType, Type type, ParquetWithFlinkSchemaVisitor visitor) { + Preconditions.checkArgument(sType != null, "Invalid DataType: null"); + if (type instanceof MessageType) { + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + RowType struct = (RowType) sType; + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); + } else if (type.isPrimitive()) { + return visitor.primitive(sType, type.asPrimitiveType()); + } else { + // if not a primitive, the typeId must be a group + GroupType group = type.asGroupType(); + LogicalTypeAnnotation annotation = group.getLogicalTypeAnnotation(); + if (annotation instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation) { + Preconditions.checkArgument( + !group.isRepetition(Type.Repetition.REPEATED), + "Invalid list: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + group); + + GroupType repeatedElement = group.getFields().get(0).asGroupType(); + Preconditions.checkArgument( + repeatedElement.isRepetition(Type.Repetition.REPEATED), + "Invalid list: inner group is not repeated"); + Preconditions.checkArgument( + repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", + group); + + Preconditions.checkArgument( + sType instanceof ArrayType, "Invalid list: %s is not an array", sType); + ArrayType array = (ArrayType) sType; + RowField element = + new RowField( + "element", array.getElementType(), "element of " + array.asSummaryString()); + + visitor.fieldNames.push(repeatedElement.getName()); + try { + T elementResult = null; + if (repeatedElement.getFieldCount() > 0) { + elementResult = visitField(element, repeatedElement.getType(0), visitor); + } + + return visitor.list(array, group, elementResult); + + } finally { + visitor.fieldNames.pop(); + } + } else if (annotation instanceof LogicalTypeAnnotation.MapLogicalTypeAnnotation) { + Preconditions.checkArgument( + !group.isRepetition(Type.Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); + + GroupType repeatedKeyValue = group.getType(0).asGroupType(); + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), + "Invalid map: inner group is not repeated"); + 
Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, + "Invalid map: repeated group does not have 2 fields"); + + Preconditions.checkArgument( + sType instanceof MapType, "Invalid map: %s is not a map", sType); + MapType map = (MapType) sType; + RowField keyField = + new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); + RowField valueField = + new RowField("value", map.getValueType(), "value of " + map.asSummaryString()); + + visitor.fieldNames.push(repeatedKeyValue.getName()); + try { + T keyResult = null; + T valueResult = null; + switch (repeatedKeyValue.getFieldCount()) { + case 2: + // if there are 2 fields, both key and value are projected + keyResult = visitField(keyField, repeatedKeyValue.getType(0), visitor); + valueResult = visitField(valueField, repeatedKeyValue.getType(1), visitor); + break; + case 1: + // if there is just one, use the name to determine what it is + Type keyOrValue = repeatedKeyValue.getType(0); + if (keyOrValue.getName().equalsIgnoreCase("key")) { + keyResult = visitField(keyField, keyOrValue, visitor); + // value result remains null + } else { + valueResult = visitField(valueField, keyOrValue, visitor); + // key result remains null + } + break; + default: + // both results will remain null + } + + return visitor.map(map, group, keyResult, valueResult); + + } finally { + visitor.fieldNames.pop(); + } + } + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + RowType struct = (RowType) sType; + return visitor.struct(struct, group, visitFields(struct, group, visitor)); + } + } + + private static T visitField( + RowType.RowField sField, Type field, ParquetWithFlinkSchemaVisitor visitor) { + visitor.fieldNames.push(field.getName()); + try { + return visit(sField.getType(), field, visitor); + } finally { + visitor.fieldNames.pop(); + } + } + + private static List visitFields( + RowType struct, GroupType group, ParquetWithFlinkSchemaVisitor visitor) { + List sFields = struct.getFields(); + List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); + + int pos = 0; + for (RowField sField : sFields) { + if (sField.getType().getTypeRoot() == LogicalTypeRoot.NULL) { + // skip null types that are not in the Parquet schema + continue; + } + + Type field = group.getFields().get(pos); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), + "Structs do not match: field %s != %s", + field.getName(), + sField.getName()); + results.add(visitField(sField, field, visitor)); + + pos += 1; + } + + return results; + } + + public T message(RowType sStruct, MessageType message, List fields) { + return null; + } + + public T struct(RowType sStruct, GroupType struct, List fields) { + return null; + } + + public T list(ArrayType sArray, GroupType array, T element) { + return null; + } + + public T map(MapType sMap, GroupType map, T key, T value) { + return null; + } + + public T primitive(LogicalType sPrimitive, PrimitiveType primitive) { + return null; + } + + protected String[] currentPath() { + return Lists.newArrayList(fieldNames.descendingIterator()).toArray(new String[0]); + } + + protected String[] path(String name) { + List list = Lists.newArrayList(fieldNames.descendingIterator()); + list.add(name); + return list.toArray(new String[0]); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java new file mode 100644 index 000000000000..9395b0e4810e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java @@ -0,0 +1,342 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.util.Arrays; +import java.util.Map; +import java.util.Objects; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.StringUtils; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkRowData; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; + +public class RowDataProjection implements RowData { + /** + * Creates a projecting wrapper for {@link RowData} rows. + * + *
<p>
This projection will not project the nested children types of repeated types like lists and + * maps. + * + * @param schema schema of rows wrapped by this projection + * @param projectedSchema result schema of the projected rows + * @return a wrapper to project rows + */ + public static RowDataProjection create(Schema schema, Schema projectedSchema) { + return RowDataProjection.create( + FlinkSchemaUtil.convert(schema), schema.asStruct(), projectedSchema.asStruct()); + } + + /** + * Creates a projecting wrapper for {@link RowData} rows. + * + *
<p>
This projection will not project the nested children types of repeated types like lists and + * maps. + * + * @param rowType flink row type of rows wrapped by this projection + * @param schema schema of rows wrapped by this projection + * @param projectedSchema result schema of the projected rows + * @return a wrapper to project rows + */ + public static RowDataProjection create( + RowType rowType, Types.StructType schema, Types.StructType projectedSchema) { + return new RowDataProjection(rowType, schema, projectedSchema); + } + + private final RowData.FieldGetter[] getters; + private RowData rowData; + + private RowDataProjection( + RowType rowType, Types.StructType rowStruct, Types.StructType projectType) { + Map fieldIdToPosition = Maps.newHashMap(); + for (int i = 0; i < rowStruct.fields().size(); i++) { + fieldIdToPosition.put(rowStruct.fields().get(i).fieldId(), i); + } + + this.getters = new RowData.FieldGetter[projectType.fields().size()]; + for (int i = 0; i < getters.length; i++) { + Types.NestedField projectField = projectType.fields().get(i); + Types.NestedField rowField = rowStruct.field(projectField.fieldId()); + + Preconditions.checkNotNull( + rowField, + "Cannot locate the project field <%s> in the iceberg struct <%s>", + projectField, + rowStruct); + + getters[i] = + createFieldGetter( + rowType, fieldIdToPosition.get(projectField.fieldId()), rowField, projectField); + } + } + + private static RowData.FieldGetter createFieldGetter( + RowType rowType, int position, Types.NestedField rowField, Types.NestedField projectField) { + Preconditions.checkArgument( + rowField.type().typeId() == projectField.type().typeId(), + "Different iceberg type between row field <%s> and project field <%s>", + rowField, + projectField); + + switch (projectField.type().typeId()) { + case STRUCT: + RowType nestedRowType = (RowType) rowType.getTypeAt(position); + return row -> { + // null nested struct value + if (row.isNullAt(position)) { + return null; + } + + RowData nestedRow = row.getRow(position, nestedRowType.getFieldCount()); + return RowDataProjection.create( + nestedRowType, rowField.type().asStructType(), projectField.type().asStructType()) + .wrap(nestedRow); + }; + + case MAP: + Types.MapType projectedMap = projectField.type().asMapType(); + Types.MapType originalMap = rowField.type().asMapType(); + + boolean keyProjectable = + !projectedMap.keyType().isNestedType() + || projectedMap.keyType().equals(originalMap.keyType()); + boolean valueProjectable = + !projectedMap.valueType().isNestedType() + || projectedMap.valueType().equals(originalMap.valueType()); + Preconditions.checkArgument( + keyProjectable && valueProjectable, + "Cannot project a partial map key or value with non-primitive type. Trying to project <%s> out of <%s>", + projectField, + rowField); + + return FlinkRowData.createFieldGetter(rowType.getTypeAt(position), position); + + case LIST: + Types.ListType projectedList = projectField.type().asListType(); + Types.ListType originalList = rowField.type().asListType(); + + boolean elementProjectable = + !projectedList.elementType().isNestedType() + || projectedList.elementType().equals(originalList.elementType()); + Preconditions.checkArgument( + elementProjectable, + "Cannot project a partial list element with non-primitive type. 
Trying to project <%s> out of <%s>", + projectField, + rowField); + + return FlinkRowData.createFieldGetter(rowType.getTypeAt(position), position); + + default: + return FlinkRowData.createFieldGetter(rowType.getTypeAt(position), position); + } + } + + public RowData wrap(RowData row) { + // StructProjection allow wrapping null root struct object. + // See more discussions in https://github.com/apache/iceberg/pull/7517. + // RowDataProjection never allowed null root object to be wrapped. + // Hence, it is fine to enforce strict Preconditions check here. + Preconditions.checkArgument(row != null, "Invalid row data: null"); + this.rowData = row; + return this; + } + + private Object getValue(int pos) { + Preconditions.checkState(rowData != null, "Row data not wrapped"); + return getters[pos].getFieldOrNull(rowData); + } + + @Override + public int getArity() { + return getters.length; + } + + @Override + public RowKind getRowKind() { + Preconditions.checkState(rowData != null, "Row data not wrapped"); + return rowData.getRowKind(); + } + + @Override + public void setRowKind(RowKind kind) { + throw new UnsupportedOperationException("Cannot set row kind in the RowDataProjection"); + } + + @Override + public boolean isNullAt(int pos) { + return getValue(pos) == null; + } + + @Override + public boolean getBoolean(int pos) { + return (boolean) getValue(pos); + } + + @Override + public byte getByte(int pos) { + return (byte) getValue(pos); + } + + @Override + public short getShort(int pos) { + return (short) getValue(pos); + } + + @Override + public int getInt(int pos) { + return (int) getValue(pos); + } + + @Override + public long getLong(int pos) { + return (long) getValue(pos); + } + + @Override + public float getFloat(int pos) { + return (float) getValue(pos); + } + + @Override + public double getDouble(int pos) { + return (double) getValue(pos); + } + + @Override + public StringData getString(int pos) { + return (StringData) getValue(pos); + } + + @Override + public DecimalData getDecimal(int pos, int precision, int scale) { + return (DecimalData) getValue(pos); + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + return (TimestampData) getValue(pos); + } + + @Override + @SuppressWarnings("unchecked") + public RawValueData getRawValue(int pos) { + return (RawValueData) getValue(pos); + } + + @Override + public byte[] getBinary(int pos) { + return (byte[]) getValue(pos); + } + + @Override + public ArrayData getArray(int pos) { + return (ArrayData) getValue(pos); + } + + @Override + public MapData getMap(int pos) { + return (MapData) getValue(pos); + } + + @Override + public RowData getRow(int pos, int numFields) { + return (RowData) getValue(pos); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (!(o instanceof RowDataProjection)) { + return false; + } + + RowDataProjection that = (RowDataProjection) o; + return deepEquals(that); + } + + @Override + public int hashCode() { + int result = Objects.hashCode(getRowKind()); + for (int pos = 0; pos < getArity(); pos++) { + if (!isNullAt(pos)) { + // Arrays.deepHashCode handles array object properly + result = 31 * result + Arrays.deepHashCode(new Object[] {getValue(pos)}); + } + } + + return result; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(getRowKind().shortString()).append("("); + for (int pos = 0; pos < getArity(); pos++) { + if (pos != 0) { + sb.append(","); + } + // copied the behavior from Flink 
GenericRowData + sb.append(StringUtils.arrayAwareToString(getValue(pos))); + } + + sb.append(")"); + return sb.toString(); + } + + private boolean deepEquals(RowDataProjection other) { + if (getRowKind() != other.getRowKind()) { + return false; + } + + if (getArity() != other.getArity()) { + return false; + } + + for (int pos = 0; pos < getArity(); ++pos) { + if (isNullAt(pos) && other.isNullAt(pos)) { + continue; + } + + if ((isNullAt(pos) && !other.isNullAt(pos)) || (!isNullAt(pos) && other.isNullAt(pos))) { + return false; + } + + // Objects.deepEquals handles array object properly + if (!Objects.deepEquals(getValue(pos), other.getValue(pos))) { + return false; + } + } + + return true; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java new file mode 100644 index 000000000000..f23a7ee3d0d3 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
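To make the projection contract concrete, here is a small usage sketch; the schema, field names, and values are invented for illustration. Positions on the projected row refer to the projected schema, while field matching is done by field id:

import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.data.RowDataProjection;
import org.apache.iceberg.types.Types;

// Illustrative only: project a two-column row down to its "name" column.
public class RowDataProjectionExample {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "name", Types.StringType.get()));
    Schema projected = schema.select("name");

    RowData row = GenericRowData.of(42L, StringData.fromString("alice"));
    RowData view = RowDataProjection.create(schema, projected).wrap(row);

    System.out.println(view.getArity());   // 1
    System.out.println(view.getString(0)); // alice
  }
}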
+ */ +package org.apache.iceberg.flink.data; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.util.UUID; +import org.apache.avro.generic.GenericData; +import org.apache.avro.util.Utf8; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ByteBuffers; +import org.apache.iceberg.util.DateTimeUtil; +import org.apache.iceberg.util.UUIDUtil; + +public class RowDataUtil { + + private RowDataUtil() {} + + public static Object convertConstant(Type type, Object value) { + if (value == null) { + return null; + } + + switch (type.typeId()) { + case DECIMAL: // DecimalData + Types.DecimalType decimal = (Types.DecimalType) type; + return DecimalData.fromBigDecimal((BigDecimal) value, decimal.precision(), decimal.scale()); + case STRING: // StringData + if (value instanceof Utf8) { + Utf8 utf8 = (Utf8) value; + return StringData.fromBytes(utf8.getBytes(), 0, utf8.getByteLength()); + } + return StringData.fromString(value.toString()); + case FIXED: // byte[] + if (value instanceof byte[]) { + return value; + } else if (value instanceof GenericData.Fixed) { + return ((GenericData.Fixed) value).bytes(); + } + return ByteBuffers.toByteArray((ByteBuffer) value); + case BINARY: // byte[] + return ByteBuffers.toByteArray((ByteBuffer) value); + case TIME: // int mills instead of long + return (int) ((Long) value / 1000); + case TIMESTAMP: // TimestampData + return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value)); + case UUID: + return UUIDUtil.convert((UUID) value); + default: + } + return value; + } + + /** + * Similar to the private {@link RowDataSerializer#copyRowData(RowData, RowData)} method. This + * skips the check the arity of rowType and from, because the from RowData may contains additional + * column for position deletes. Using {@link RowDataSerializer#copy(RowData, RowData)} will fail + * the arity check. + */ + public static RowData clone( + RowData from, + RowData reuse, + RowType rowType, + TypeSerializer[] fieldSerializers, + RowData.FieldGetter[] fieldGetters) { + GenericRowData ret; + if (reuse instanceof GenericRowData) { + ret = (GenericRowData) reuse; + } else { + ret = new GenericRowData(from.getArity()); + } + + ret.setRowKind(from.getRowKind()); + for (int i = 0; i < rowType.getFieldCount(); i++) { + if (!from.isNullAt(i)) { + ret.setField(i, fieldSerializers[i].copy(fieldGetters[i].getFieldOrNull(from))); + } else { + ret.setField(i, null); + } + } + + return ret; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java new file mode 100644 index 000000000000..34576a1e5c0b --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java @@ -0,0 +1,300 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
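A quick illustration of the constant conversion above (the values are arbitrary): Avro Utf8 strings become StringData, and time constants carried as microseconds are truncated to an int of milliseconds.

import org.apache.avro.util.Utf8;
import org.apache.iceberg.flink.data.RowDataUtil;
import org.apache.iceberg.types.Types;

// Illustrative only: a couple of the conversions performed by RowDataUtil.convertConstant.
public class ConvertConstantExample {
  public static void main(String[] args) {
    // Avro Utf8 -> Flink StringData
    Object name = RowDataUtil.convertConstant(Types.StringType.get(), new Utf8("city"));
    // time constant in microseconds -> int milliseconds (sub-millisecond part dropped)
    Object time = RowDataUtil.convertConstant(Types.TimeType.get(), 12_345_678L);
    System.out.println(name); // city
    System.out.println(time); // 12345
  }
}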
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.time.Duration; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ByteBuffers; + +@Internal +public class StructRowData implements RowData { + private final Types.StructType type; + private RowKind kind; + private StructLike struct; + + public StructRowData(Types.StructType type) { + this(type, RowKind.INSERT); + } + + public StructRowData(Types.StructType type, RowKind kind) { + this(type, null, kind); + } + + private StructRowData(Types.StructType type, StructLike struct) { + this(type, struct, RowKind.INSERT); + } + + private StructRowData(Types.StructType type, StructLike struct, RowKind kind) { + this.type = type; + this.struct = struct; + this.kind = kind; + } + + public StructRowData setStruct(StructLike newStruct) { + this.struct = newStruct; + return this; + } + + @Override + public int getArity() { + return struct.size(); + } + + @Override + public RowKind getRowKind() { + return kind; + } + + @Override + public void setRowKind(RowKind newKind) { + Preconditions.checkNotNull(newKind, "kind can not be null"); + this.kind = newKind; + } + + @Override + public boolean isNullAt(int pos) { + return struct.get(pos, Object.class) == null; + } + + @Override + public boolean getBoolean(int pos) { + return struct.get(pos, Boolean.class); + } + + @Override + public byte getByte(int pos) { + return (byte) (int) struct.get(pos, Integer.class); + } + + @Override + public short getShort(int pos) { + return (short) (int) struct.get(pos, Integer.class); + } + + @Override + public int getInt(int pos) { + Object integer = struct.get(pos, Object.class); + + if (integer instanceof Integer) { + return (int) integer; + } else if (integer instanceof LocalDate) { + return (int) ((LocalDate) integer).toEpochDay(); + } else if (integer instanceof LocalTime) { + return (int) 
(((LocalTime) integer).toNanoOfDay() / 1000_000); + } else { + throw new IllegalStateException( + "Unknown type for int field. Type name: " + integer.getClass().getName()); + } + } + + @Override + public long getLong(int pos) { + Object longVal = struct.get(pos, Object.class); + + if (longVal instanceof Long) { + return (long) longVal; + } else if (longVal instanceof OffsetDateTime) { + return Duration.between(Instant.EPOCH, (OffsetDateTime) longVal).toNanos() / 1000; + } else if (longVal instanceof LocalDate) { + return ((LocalDate) longVal).toEpochDay(); + } else if (longVal instanceof LocalTime) { + return ((LocalTime) longVal).toNanoOfDay(); + } else if (longVal instanceof LocalDateTime) { + return Duration.between(Instant.EPOCH, ((LocalDateTime) longVal).atOffset(ZoneOffset.UTC)) + .toNanos() + / 1000; + } else { + throw new IllegalStateException( + "Unknown type for long field. Type name: " + longVal.getClass().getName()); + } + } + + @Override + public float getFloat(int pos) { + return struct.get(pos, Float.class); + } + + @Override + public double getDouble(int pos) { + return struct.get(pos, Double.class); + } + + @Override + public StringData getString(int pos) { + return isNullAt(pos) ? null : getStringDataInternal(pos); + } + + private StringData getStringDataInternal(int pos) { + CharSequence seq = struct.get(pos, CharSequence.class); + return StringData.fromString(seq.toString()); + } + + @Override + public DecimalData getDecimal(int pos, int precision, int scale) { + return isNullAt(pos) + ? null + : DecimalData.fromBigDecimal(getDecimalInternal(pos), precision, scale); + } + + private BigDecimal getDecimalInternal(int pos) { + return struct.get(pos, BigDecimal.class); + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + long timeLong = getLong(pos); + return TimestampData.fromEpochMillis(timeLong / 1000, (int) (timeLong % 1000) * 1000); + } + + @Override + public RawValueData getRawValue(int pos) { + throw new UnsupportedOperationException("Not supported yet."); + } + + @Override + public byte[] getBinary(int pos) { + return isNullAt(pos) ? null : getBinaryInternal(pos); + } + + private byte[] getBinaryInternal(int pos) { + Object bytes = struct.get(pos, Object.class); + + // should only be either ByteBuffer or byte[] + if (bytes instanceof ByteBuffer) { + return ByteBuffers.toByteArray((ByteBuffer) bytes); + } else if (bytes instanceof byte[]) { + return (byte[]) bytes; + } else if (bytes instanceof UUID) { + UUID uuid = (UUID) bytes; + ByteBuffer bb = ByteBuffer.allocate(16); + bb.putLong(uuid.getMostSignificantBits()); + bb.putLong(uuid.getLeastSignificantBits()); + return bb.array(); + } else { + throw new IllegalStateException( + "Unknown type for binary field. Type name: " + bytes.getClass().getName()); + } + } + + @Override + public ArrayData getArray(int pos) { + return isNullAt(pos) + ? null + : (ArrayData) + convertValue(type.fields().get(pos).type().asListType(), struct.get(pos, List.class)); + } + + @Override + public MapData getMap(int pos) { + return isNullAt(pos) + ? null + : (MapData) + convertValue(type.fields().get(pos).type().asMapType(), struct.get(pos, Map.class)); + } + + @Override + public RowData getRow(int pos, int numFields) { + return isNullAt(pos) ? 
null : getStructRowData(pos); + } + + private StructRowData getStructRowData(int pos) { + return new StructRowData( + type.fields().get(pos).type().asStructType(), struct.get(pos, StructLike.class)); + } + + private Object convertValue(Type elementType, Object value) { + switch (elementType.typeId()) { + case BOOLEAN: + case INTEGER: + case DATE: + case TIME: + case LONG: + case FLOAT: + case DOUBLE: + case DECIMAL: + return value; + case TIMESTAMP: + long millisecond = (long) value / 1000; + int nanoOfMillisecond = (int) ((Long) value % 1000) * 1000; + return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); + case STRING: + return StringData.fromString(value.toString()); + case FIXED: + case BINARY: + return ByteBuffers.toByteArray((ByteBuffer) value); + case STRUCT: + return new StructRowData(elementType.asStructType(), (StructLike) value); + case LIST: + List list = (List) value; + Object[] array = new Object[list.size()]; + + int index = 0; + for (Object element : list) { + if (element == null) { + array[index] = null; + } else { + array[index] = convertValue(elementType.asListType().elementType(), element); + } + + index += 1; + } + return new GenericArrayData(array); + case MAP: + Types.MapType mapType = elementType.asMapType(); + Set> entries = ((Map) value).entrySet(); + Map result = Maps.newHashMap(); + for (Map.Entry entry : entries) { + final Object keyValue = convertValue(mapType.keyType(), entry.getKey()); + final Object valueValue = convertValue(mapType.valueType(), entry.getValue()); + result.put(keyValue, valueValue); + } + + return new GenericMapData(result); + default: + throw new UnsupportedOperationException("Unsupported element type: " + elementType); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java new file mode 100644 index 000000000000..2fce5e0b3e80 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java @@ -0,0 +1,324 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
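For orientation, a minimal self-contained sketch of wrapping an Iceberg StructLike with StructRowData; the inline StructLike and its values are purely illustrative:

import org.apache.iceberg.StructLike;
import org.apache.iceberg.flink.data.StructRowData;
import org.apache.iceberg.types.Types;

// Illustrative only: StructRowData adapts an Iceberg StructLike to Flink's RowData view.
public class StructRowDataExample {
  public static void main(String[] args) {
    Types.StructType struct =
        Types.StructType.of(
            Types.NestedField.required(1, "id", Types.IntegerType.get()),
            Types.NestedField.optional(2, "name", Types.StringType.get()));

    StructLike source =
        new StructLike() {
          private final Object[] values = {7, "alice"};

          @Override
          public int size() {
            return values.length;
          }

          @Override
          public <T> T get(int pos, Class<T> javaClass) {
            return javaClass.cast(values[pos]);
          }

          @Override
          public <T> void set(int pos, T value) {
            values[pos] = value;
          }
        };

    StructRowData row = new StructRowData(struct).setStruct(source);
    System.out.println(row.getInt(0));    // 7
    System.out.println(row.getString(1)); // alice
  }
}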
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import java.time.Duration; +import java.util.Map; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.util.OutputTag; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.actions.DeleteOrphanFiles.PrefixMismatchMode; +import org.apache.iceberg.flink.maintenance.operator.DeleteFilesProcessor; +import org.apache.iceberg.flink.maintenance.operator.FileNameReader; +import org.apache.iceberg.flink.maintenance.operator.FileUriKeySelector; +import org.apache.iceberg.flink.maintenance.operator.ListFileSystemFiles; +import org.apache.iceberg.flink.maintenance.operator.ListMetadataFiles; +import org.apache.iceberg.flink.maintenance.operator.MetadataTablePlanner; +import org.apache.iceberg.flink.maintenance.operator.OrphanFilesDetector; +import org.apache.iceberg.flink.maintenance.operator.SkipOnError; +import org.apache.iceberg.flink.maintenance.operator.TaskResultAggregator; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.relocated.com.google.common.base.Splitter; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.ThreadPools; + +/** Delete orphan files from the file system. */ +public class DeleteOrphanFiles { + + private static final Schema FILE_PATH_SCHEMA = new Schema(DataFile.FILE_PATH); + private static final ScanContext FILE_PATH_SCAN_CONTEXT = + ScanContext.builder().streaming(true).project(FILE_PATH_SCHEMA).build(); + private static final Splitter COMMA_SPLITTER = Splitter.on(","); + + @Internal + public static final OutputTag ERROR_STREAM = + new OutputTag<>("error-stream", TypeInformation.of(Exception.class)); + + static final String PLANNER_TASK_NAME = "Table Planner"; + static final String READER_TASK_NAME = "Files Reader"; + static final String FILESYSTEM_FILES_TASK_NAME = "Filesystem Files"; + static final String METADATA_FILES_TASK_NAME = "List metadata Files"; + static final String DELETE_FILES_TASK_NAME = "Delete File"; + static final String AGGREGATOR_TASK_NAME = "Orphan Files Aggregator"; + static final String FILTER_FILES_TASK_NAME = "Filter File"; + static final String SKIP_ON_ERROR_TASK_NAME = "Skip On Error"; + + public static DeleteOrphanFiles.Builder builder() { + return new DeleteOrphanFiles.Builder(); + } + + private DeleteOrphanFiles() { + // Do not instantiate directly + } + + public static class Builder extends MaintenanceTaskBuilder { + private String location; + private Duration minAge = Duration.ofDays(3); + private int planningWorkerPoolSize = ThreadPools.WORKER_THREAD_POOL_SIZE; + private int deleteBatchSize = 1000; + private boolean usePrefixListing = false; + private Map equalSchemes = + Maps.newHashMap( + ImmutableMap.of( + "s3n", "s3", + "s3a", "s3")); + private final Map equalAuthorities = Maps.newHashMap(); + private PrefixMismatchMode prefixMismatchMode = PrefixMismatchMode.ERROR; + + @Override + String maintenanceTaskName() { + return "DeleteOrphanFiles"; + } + + /** + * The location to start the recursive listing the candidate files for removal. By default, the + * {@link Table#location()} is used. 
+ * + * @param newLocation the task will scan + * @return for chained calls + */ + public Builder location(String newLocation) { + this.location = newLocation; + return this; + } + + /** + * Whether to use prefix listing when listing files from the file system. + * + * @param newUsePrefixListing true to enable prefix listing, false otherwise + * @return for chained calls + */ + public Builder usePrefixListing(boolean newUsePrefixListing) { + this.usePrefixListing = newUsePrefixListing; + return this; + } + + /** + * Action behavior when location prefixes (schemes/authorities) mismatch. + * + * @param newPrefixMismatchMode to action when mismatch + * @return for chained calls + */ + public Builder prefixMismatchMode(PrefixMismatchMode newPrefixMismatchMode) { + this.prefixMismatchMode = newPrefixMismatchMode; + return this; + } + + /** + * The files newer than this age will not be removed. + * + * @param newMinAge of the files to be removed + * @return for chained calls + */ + public Builder minAge(Duration newMinAge) { + this.minAge = newMinAge; + return this; + } + + /** + * The worker pool size used for planning the scan of the {@link MetadataTableType#ALL_FILES} + * table. This scan is used for determining the files used by the table. + * + * @param newPlanningWorkerPoolSize for scanning + * @return for chained calls + */ + public Builder planningWorkerPoolSize(int newPlanningWorkerPoolSize) { + this.planningWorkerPoolSize = newPlanningWorkerPoolSize; + return this; + } + + /** + * Passes schemes that should be considered equal. + * + *
<p>
The key may include a comma-separated list of schemes. For instance, + * Map("s3a,s3,s3n","s3"). + * + * @param newEqualSchemes list of equal schemes + * @return this for method chaining + */ + public Builder equalSchemes(Map newEqualSchemes) { + equalSchemes.putAll(flattenMap(newEqualSchemes)); + return this; + } + + /** + * Passes authorities that should be considered equal. + * + *
<p>
The key may include a comma-separate list of authorities. For instance, + * Map("s1name,s2name","servicename"). + * + * @param newEqualAuthorities list of equal authorities + * @return this for method chaining + */ + public Builder equalAuthorities(Map newEqualAuthorities) { + equalAuthorities.putAll(flattenMap(newEqualAuthorities)); + return this; + } + + /** + * Size of the batch used to deleting the files. + * + * @param newDeleteBatchSize number of batch file + * @return for chained calls + */ + public Builder deleteBatchSize(int newDeleteBatchSize) { + this.deleteBatchSize = newDeleteBatchSize; + return this; + } + + @Override + DataStream append(DataStream trigger) { + tableLoader().open(); + + // Collect all data files + SingleOutputStreamOperator splits = + trigger + .process( + new MetadataTablePlanner( + taskName(), + index(), + tableLoader(), + FILE_PATH_SCAN_CONTEXT, + MetadataTableType.ALL_FILES, + planningWorkerPoolSize)) + .name(operatorName(PLANNER_TASK_NAME)) + .uid(PLANNER_TASK_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .forceNonParallel(); + + // Read the records and get all data files + SingleOutputStreamOperator tableDataFiles = + splits + .rebalance() + .process( + new FileNameReader( + taskName(), + index(), + tableLoader(), + FILE_PATH_SCHEMA, + FILE_PATH_SCAN_CONTEXT, + MetadataTableType.ALL_FILES)) + .name(operatorName(READER_TASK_NAME)) + .uid(READER_TASK_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .setParallelism(parallelism()); + + // Collect all meta data files + SingleOutputStreamOperator tableMetadataFiles = + trigger + .process(new ListMetadataFiles(taskName(), index(), tableLoader())) + .name(operatorName(METADATA_FILES_TASK_NAME)) + .uid(METADATA_FILES_TASK_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .forceNonParallel(); + + // List the all file system files + SingleOutputStreamOperator allFsFiles = + trigger + .process( + new ListFileSystemFiles( + taskName(), + index(), + tableLoader(), + location, + minAge.toMillis(), + usePrefixListing)) + .name(operatorName(FILESYSTEM_FILES_TASK_NAME)) + .uid(FILESYSTEM_FILES_TASK_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .forceNonParallel(); + + SingleOutputStreamOperator filesToDelete = + tableMetadataFiles + .union(tableDataFiles) + .keyBy(new FileUriKeySelector(equalSchemes, equalAuthorities)) + .connect(allFsFiles.keyBy(new FileUriKeySelector(equalSchemes, equalAuthorities))) + .process(new OrphanFilesDetector(prefixMismatchMode, equalSchemes, equalAuthorities)) + .slotSharingGroup(slotSharingGroup()) + .name(operatorName(FILTER_FILES_TASK_NAME)) + .uid(FILTER_FILES_TASK_NAME + uidSuffix()) + .setParallelism(parallelism()); + + DataStream errorStream = + tableMetadataFiles + .getSideOutput(ERROR_STREAM) + .union( + allFsFiles.getSideOutput(ERROR_STREAM), + tableDataFiles.getSideOutput(ERROR_STREAM), + splits.getSideOutput(ERROR_STREAM), + filesToDelete.getSideOutput(ERROR_STREAM)); + + // Stop deleting the files if there is an error + SingleOutputStreamOperator filesOrSkip = + filesToDelete + .connect(errorStream) + .transform( + operatorName(SKIP_ON_ERROR_TASK_NAME), + TypeInformation.of(String.class), + new SkipOnError()) + .uid(SKIP_ON_ERROR_TASK_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .forceNonParallel(); + + // delete the files + filesOrSkip + .rebalance() + .transform( + operatorName(DELETE_FILES_TASK_NAME), + TypeInformation.of(Void.class), + new DeleteFilesProcessor( + tableLoader().loadTable(), taskName(), 
index(), deleteBatchSize)) + .uid(DELETE_FILES_TASK_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .setParallelism(parallelism()); + + // Ignore the file deletion result and return the DataStream directly + return trigger + .connect(errorStream) + .transform( + operatorName(AGGREGATOR_TASK_NAME), + TypeInformation.of(TaskResult.class), + new TaskResultAggregator(tableName(), taskName(), index())) + .uid(AGGREGATOR_TASK_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .forceNonParallel(); + } + } + + private static Map flattenMap(Map map) { + Map flattenedMap = Maps.newHashMap(); + if (map != null) { + for (String key : map.keySet()) { + String value = map.get(key); + for (String splitKey : COMMA_SPLITTER.split(key)) { + flattenedMap.put(splitKey.trim(), value.trim()); + } + } + } + + return flattenedMap; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java new file mode 100644 index 000000000000..628a91141474 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.api; + +import java.time.Duration; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.flink.maintenance.operator.DeleteFilesProcessor; +import org.apache.iceberg.flink.maintenance.operator.ExpireSnapshotsProcessor; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** Deletes expired snapshots and the corresponding files. */ +public class ExpireSnapshots { + private static final int DELETE_BATCH_SIZE_DEFAULT = 1000; + private static final String EXECUTOR_OPERATOR_NAME = "Expire Snapshot"; + @VisibleForTesting static final String DELETE_FILES_OPERATOR_NAME = "Delete file"; + + private ExpireSnapshots() {} + + /** Creates the builder for creating a stream which expires snapshots for the table. 
*/ + public static Builder builder() { + return new Builder(); + } + + public static class Builder extends MaintenanceTaskBuilder { + private Duration maxSnapshotAge = null; + private Integer numSnapshots = null; + private Integer planningWorkerPoolSize; + private int deleteBatchSize = DELETE_BATCH_SIZE_DEFAULT; + private Boolean cleanExpiredMetadata = null; + + @Override + String maintenanceTaskName() { + return "ExpireSnapshots"; + } + + /** + * The snapshots older than this age will be removed. + * + * @param newMaxSnapshotAge of the snapshots to be removed + */ + public Builder maxSnapshotAge(Duration newMaxSnapshotAge) { + this.maxSnapshotAge = newMaxSnapshotAge; + return this; + } + + /** + * The minimum number of {@link Snapshot}s to retain. For more details description see {@link + * org.apache.iceberg.ExpireSnapshots#retainLast(int)}. + * + * @param newNumSnapshots number of snapshots to retain + */ + public Builder retainLast(int newNumSnapshots) { + this.numSnapshots = newNumSnapshots; + return this; + } + + /** + * The worker pool size used to calculate the files to delete. If not set, the shared worker + * pool is used. + * + * @param newPlanningWorkerPoolSize for planning files to delete + */ + public Builder planningWorkerPoolSize(int newPlanningWorkerPoolSize) { + this.planningWorkerPoolSize = newPlanningWorkerPoolSize; + return this; + } + + /** + * Size of the batch used to deleting the files. + * + * @param newDeleteBatchSize used for deleting + */ + public Builder deleteBatchSize(int newDeleteBatchSize) { + this.deleteBatchSize = newDeleteBatchSize; + return this; + } + + /** + * Expires unused table metadata such as partition specs and schemas. + * + * @param newCleanExpiredMetadata remove unused partition specs, schemas, or other metadata when + * true + * @return this for method chaining + */ + public Builder cleanExpiredMetadata(boolean newCleanExpiredMetadata) { + this.cleanExpiredMetadata = newCleanExpiredMetadata; + return this; + } + + @Override + DataStream append(DataStream trigger) { + Preconditions.checkNotNull(tableLoader(), "TableLoader should not be null"); + + SingleOutputStreamOperator result = + trigger + .process( + new ExpireSnapshotsProcessor( + tableLoader(), + maxSnapshotAge == null ? null : maxSnapshotAge.toMillis(), + numSnapshots, + planningWorkerPoolSize, + cleanExpiredMetadata)) + .name(operatorName(EXECUTOR_OPERATOR_NAME)) + .uid(EXECUTOR_OPERATOR_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .forceNonParallel(); + + result + .getSideOutput(ExpireSnapshotsProcessor.DELETE_STREAM) + .rebalance() + .transform( + operatorName(DELETE_FILES_OPERATOR_NAME), + TypeInformation.of(Void.class), + new DeleteFilesProcessor( + tableLoader().loadTable(), taskName(), index(), deleteBatchSize)) + .uid(DELETE_FILES_OPERATOR_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .setParallelism(parallelism()); + + // Ignore the file deletion result and return the DataStream directly + return result; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java new file mode 100644 index 000000000000..0c88abf82099 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
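A minimal sketch of the snapshot-expiry builder defined above (imports omitted); the retention values are illustrative.

// Expire snapshots older than 7 days while always keeping the last 5, remove unused
// partition specs/schemas from table metadata, and delete files in batches of 1000.
ExpireSnapshots.Builder expireSnapshots =
    ExpireSnapshots.builder()
        .maxSnapshotAge(Duration.ofDays(7))
        .retainLast(5)
        .cleanExpiredMetadata(true)
        .deleteBatchSize(1000);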
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.api; + +import java.util.Map; +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkConfParser; + +public class FlinkMaintenanceConfig { + + public static final String PREFIX = "flink-maintenance."; + + public static final String LOCK_CHECK_DELAY = PREFIX + "lock-check-delay-seconds"; + public static final ConfigOption LOCK_CHECK_DELAY_OPTION = + ConfigOptions.key(LOCK_CHECK_DELAY) + .longType() + .defaultValue(TableMaintenance.LOCK_CHECK_DELAY_SECOND_DEFAULT) + .withDescription( + "The delay time (in seconds) between each lock check during maintenance operations such as " + + "rewriting data files, manifest files, expiring snapshots, and deleting orphan files."); + + public static final String PARALLELISM = PREFIX + "parallelism"; + public static final ConfigOption PARALLELISM_OPTION = + ConfigOptions.key(PARALLELISM) + .intType() + .defaultValue(ExecutionConfig.PARALLELISM_DEFAULT) + .withDescription("The number of parallel tasks for the maintenance action."); + + public static final String RATE_LIMIT = PREFIX + "rate-limit-seconds"; + public static final ConfigOption RATE_LIMIT_OPTION = + ConfigOptions.key(RATE_LIMIT) + .longType() + .defaultValue(TableMaintenance.RATE_LIMIT_SECOND_DEFAULT) + .withDescription( + "The rate limit (in seconds) for maintenance operations. " + + "This controls how many operations can be performed per second."); + + public static final String SLOT_SHARING_GROUP = PREFIX + "slot-sharing-group"; + public static final ConfigOption SLOT_SHARING_GROUP_OPTION = + ConfigOptions.key(SLOT_SHARING_GROUP) + .stringType() + .defaultValue(StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP) + .withDescription( + "The slot sharing group for maintenance tasks. " + + "Determines which operators can share slots in the Flink execution environment."); + + private final FlinkConfParser confParser; + private final Table table; + private final Map writeProperties; + private final ReadableConfig readableConfig; + + public FlinkMaintenanceConfig( + Table table, Map writeOptions, ReadableConfig readableConfig) { + this.table = table; + this.readableConfig = readableConfig; + this.writeProperties = writeOptions; + this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); + } + + /** Gets the rate limit value (in seconds) for maintenance operations. 
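A sketch of how the options above can be supplied through the Flink configuration and read back via FlinkMaintenanceConfig using the getters that follow (imports omitted); the table instance and the chosen values are assumptions for illustration.

// Set maintenance options on the Flink configuration, then resolve them through
// FlinkMaintenanceConfig; the table is assumed to be loaded elsewhere.
Configuration flinkConf = new Configuration();
flinkConf.set(FlinkMaintenanceConfig.RATE_LIMIT_OPTION, 120L);
flinkConf.set(FlinkMaintenanceConfig.PARALLELISM_OPTION, 2);
flinkConf.set(FlinkMaintenanceConfig.SLOT_SHARING_GROUP_OPTION, "maintenance");

FlinkMaintenanceConfig maintenanceConfig =
    new FlinkMaintenanceConfig(table, Collections.emptyMap(), flinkConf);
long rateLimitSeconds = maintenanceConfig.rateLimit();          // 120
String slotSharingGroup = maintenanceConfig.slotSharingGroup(); // "maintenance"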
*/ + public long rateLimit() { + return confParser + .longConf() + .option(RATE_LIMIT) + .flinkConfig(RATE_LIMIT_OPTION) + .defaultValue(RATE_LIMIT_OPTION.defaultValue()) + .parse(); + } + + /** Gets the parallelism value for maintenance tasks. */ + public int parallelism() { + return confParser + .intConf() + .option(PARALLELISM) + .flinkConfig(PARALLELISM_OPTION) + .defaultValue(PARALLELISM_OPTION.defaultValue()) + .parse(); + } + + /** Gets the lock check delay value (in seconds). */ + public long lockCheckDelay() { + return confParser + .longConf() + .option(LOCK_CHECK_DELAY) + .flinkConfig(LOCK_CHECK_DELAY_OPTION) + .defaultValue(LOCK_CHECK_DELAY_OPTION.defaultValue()) + .parse(); + } + + /** Gets the slot sharing group value for maintenance tasks. */ + public String slotSharingGroup() { + return confParser + .stringConf() + .option(SLOT_SHARING_GROUP) + .flinkConfig(SLOT_SHARING_GROUP_OPTION) + .defaultValue(SLOT_SHARING_GROUP_OPTION.defaultValue()) + .parse(); + } + + public RewriteDataFilesConfig createRewriteDataFilesConfig() { + return new RewriteDataFilesConfig(table, writeProperties, readableConfig); + } + + public LockConfig createLockConfig() { + return new LockConfig(table, writeProperties, readableConfig); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java new file mode 100644 index 000000000000..f68605accc57 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java @@ -0,0 +1,325 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.api; + +import java.io.IOException; +import java.sql.DatabaseMetaData; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.SQLNonTransientConnectionException; +import java.sql.SQLTimeoutException; +import java.sql.SQLTransientConnectionException; +import java.util.Map; +import java.util.UUID; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.jdbc.JdbcClientPool; +import org.apache.iceberg.jdbc.UncheckedInterruptedException; +import org.apache.iceberg.jdbc.UncheckedSQLException; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.util.PropertyUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** JDBC table backed implementation of the {@link TriggerLockFactory}. 
*/ +public class JdbcLockFactory implements TriggerLockFactory { + private static final Logger LOG = LoggerFactory.getLogger(JdbcLockFactory.class); + + @Internal + public static final String INIT_LOCK_TABLES_PROPERTY = + "flink-maintenance.lock.jdbc.init-lock-tables"; + + private static final String LOCK_TABLE_NAME = "flink_maintenance_lock"; + private static final int LOCK_ID_MAX_LENGTH = 100; + private static final String CREATE_LOCK_TABLE_SQL = + String.format( + "CREATE TABLE %s " + + "(LOCK_TYPE CHAR(1) NOT NULL, " + + "LOCK_ID VARCHAR(%s) NOT NULL, " + + "INSTANCE_ID CHAR(36) NOT NULL, PRIMARY KEY (LOCK_TYPE, LOCK_ID))", + LOCK_TABLE_NAME, LOCK_ID_MAX_LENGTH); + + private static final String CREATE_LOCK_SQL = + String.format( + "INSERT INTO %s (LOCK_TYPE, LOCK_ID, INSTANCE_ID) VALUES (?, ?, ?)", LOCK_TABLE_NAME); + private static final String GET_LOCK_SQL = + String.format("SELECT INSTANCE_ID FROM %s WHERE LOCK_TYPE=? AND LOCK_ID=?", LOCK_TABLE_NAME); + private static final String DELETE_LOCK_SQL = + String.format( + "DELETE FROM %s WHERE LOCK_TYPE=? AND LOCK_ID=? AND INSTANCE_ID=?", LOCK_TABLE_NAME); + + private final String uri; + private final String lockId; + private final Map properties; + private transient JdbcClientPool pool; + + /** + * Creates a new {@link TriggerLockFactory}. The lockId should be unique between the users of the + * same uri. + * + * @param uri of the jdbc connection + * @param lockId which should identify the job and the table + * @param properties used for creating the jdbc connection pool + */ + public JdbcLockFactory(String uri, String lockId, Map properties) { + Preconditions.checkNotNull(uri, "JDBC connection URI is required"); + Preconditions.checkNotNull(properties, "Properties map is required"); + Preconditions.checkArgument( + lockId.length() < LOCK_ID_MAX_LENGTH, + "Invalid prefix length: lockId should be shorter than %s", + LOCK_ID_MAX_LENGTH); + this.uri = uri; + this.lockId = lockId; + this.properties = properties; + } + + @Override + public void open() { + this.pool = new JdbcClientPool(1, uri, properties); + + if (PropertyUtil.propertyAsBoolean(properties, INIT_LOCK_TABLES_PROPERTY, false)) { + initializeLockTables(); + } + } + + /** Only used in testing to share the jdbc pool */ + @VisibleForTesting + void open(JdbcLockFactory other) { + this.pool = other.pool; + } + + @Override + public Lock createLock() { + return new JdbcLock(pool, lockId, Type.MAINTENANCE); + } + + @Override + public Lock createRecoveryLock() { + return new JdbcLock(pool, lockId, Type.RECOVERY); + } + + @Override + public void close() throws IOException { + pool.close(); + } + + private void initializeLockTables() { + LOG.debug("Creating database tables (if missing) to store table maintenance locks"); + try { + pool.run( + conn -> { + DatabaseMetaData dbMeta = conn.getMetaData(); + try (ResultSet rs = + dbMeta.getTables( + null /* catalog name */, + null /* schemaPattern */, + LOCK_TABLE_NAME /* tableNamePattern */, + null /* types */)) { + if (rs.next()) { + LOG.debug("Flink maintenance lock table already exists"); + return true; + } + } + LOG.info("Creating Flink maintenance lock table {}", LOCK_TABLE_NAME); + try (PreparedStatement ps = conn.prepareStatement(CREATE_LOCK_TABLE_SQL)) { + ps.execute(); + } + + return true; + }); + } catch (SQLTimeoutException e) { + throw new UncheckedSQLException( + e, "Cannot initialize JDBC table maintenance lock: Query timed out"); + } catch (SQLTransientConnectionException | SQLNonTransientConnectionException e) { + throw new 
UncheckedSQLException( + e, "Cannot initialize JDBC table maintenance lock: Connection failed"); + } catch (SQLException e) { + throw new UncheckedSQLException(e, "Cannot initialize JDBC table maintenance lock"); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new UncheckedInterruptedException(e, "Interrupted in call to initialize"); + } + } + + private static class JdbcLock implements TriggerLockFactory.Lock { + private final JdbcClientPool pool; + private final String lockId; + private final Type type; + + private JdbcLock(JdbcClientPool pool, String lockId, Type type) { + this.pool = pool; + this.lockId = lockId; + this.type = type; + } + + @Override + public boolean tryLock() { + if (isHeld()) { + LOG.info("Lock is already held for {}", this); + return false; + } + + String newInstanceId = UUID.randomUUID().toString(); + try { + return pool.run( + conn -> { + try (PreparedStatement sql = conn.prepareStatement(CREATE_LOCK_SQL)) { + sql.setString(1, type.key); + sql.setString(2, lockId); + sql.setString(3, newInstanceId); + int count = sql.executeUpdate(); + LOG.info( + "Created {} lock with instanceId {} with row count {}", + this, + newInstanceId, + count); + return count == 1; + } + }); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new UncheckedInterruptedException(e, "Interrupted during tryLock"); + } catch (SQLException e) { + // SQL exception happened when creating the lock. Check if the lock creation was + // successful behind the scenes. + if (newInstanceId.equals(instanceId())) { + return true; + } else { + throw new UncheckedSQLException(e, "Failed to create %s lock", this); + } + } + } + + @SuppressWarnings("checkstyle:NestedTryDepth") + @Override + public boolean isHeld() { + try { + return pool.run( + conn -> { + try (PreparedStatement sql = conn.prepareStatement(GET_LOCK_SQL)) { + sql.setString(1, type.key); + sql.setString(2, lockId); + try (ResultSet rs = sql.executeQuery()) { + return rs.next(); + } + } + }); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new UncheckedInterruptedException(e, "Interrupted during isHeld"); + } catch (SQLException e) { + // SQL exception happened when getting lock information + throw new UncheckedSQLException(e, "Failed to check the state of the lock %s", this); + } + } + + @SuppressWarnings("checkstyle:NestedTryDepth") + @Override + public void unlock() { + try { + // Possible concurrency issue: + // - `unlock` and `tryLock` happens at the same time when there is an existing lock + // + // Steps: + // 1. `unlock` removes the lock in the database, but there is a temporary connection failure + // 2. `lock` finds that there is no lock, so creates a new lock + // 3. `unlock` retries the lock removal and removes the new lock + // + // To prevent the situation above we fetch the current lockId, and remove the lock + // only with the given id. 
+ String instanceId = instanceId(); + + if (instanceId != null) { + pool.run( + conn -> { + try (PreparedStatement sql = conn.prepareStatement(DELETE_LOCK_SQL)) { + sql.setString(1, type.key); + sql.setString(2, lockId); + sql.setString(3, instanceId); + long count = sql.executeUpdate(); + LOG.info( + "Deleted {} lock with instanceId {} with row count {}", + this, + instanceId, + count); + } catch (SQLException e) { + // SQL exception happened when deleting lock information + throw new UncheckedSQLException( + e, "Failed to delete %s lock with instanceId %s", this, instanceId); + } + + return null; + }); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new UncheckedInterruptedException(e, "Interrupted during unlock"); + } catch (SQLException e) { + // SQL exception happened when getting/updating lock information + throw new UncheckedSQLException(e, "Failed to remove lock %s", this); + } + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this).add("type", type).add("lockId", lockId).toString(); + } + + @SuppressWarnings("checkstyle:NestedTryDepth") + private String instanceId() { + try { + return pool.run( + conn -> { + try (PreparedStatement sql = conn.prepareStatement(GET_LOCK_SQL)) { + sql.setString(1, type.key); + sql.setString(2, lockId); + try (ResultSet rs = sql.executeQuery()) { + if (rs.next()) { + return rs.getString(1); + } else { + return null; + } + } + } catch (SQLException e) { + // SQL exception happened when getting lock information + throw new UncheckedSQLException(e, "Failed to get lock information for %s", type); + } + }); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new UncheckedInterruptedException(e, "Interrupted during unlock"); + } catch (SQLException e) { + throw new UncheckedSQLException(e, "Failed to get lock information for %s", type); + } + } + } + + private enum Type { + MAINTENANCE("m"), + RECOVERY("r"); + + private final String key; + + Type(String key) { + this.key = key; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java new file mode 100644 index 000000000000..b28731f91c15 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
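A usage sketch for the JDBC lock factory above (imports omitted); the connection URI and lock id are placeholders.

// The lock id must be unique between users of the same URI and shorter than 100
// characters; the optional property below creates the lock table if it is missing.
Map<String, String> lockProps = Maps.newHashMap();
lockProps.put(JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY, "true");

TriggerLockFactory lockFactory =
    new JdbcLockFactory("jdbc:postgresql://dbhost:5432/iceberg", "catalog.db.table", lockProps);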
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkConfParser; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +public class LockConfig { + + public static final String PREFIX = FlinkMaintenanceConfig.PREFIX + "lock."; + + public static final ConfigOption LOCK_TYPE_OPTION = + ConfigOptions.key(PREFIX + "type") + .stringType() + .defaultValue(StringUtils.EMPTY) + .withDescription("The type of lock to use, e.g., jdbc or zookeeper."); + + public static final ConfigOption LOCK_ID_OPTION = + ConfigOptions.key(PREFIX + "lock-id") + .stringType() + .defaultValue(StringUtils.EMPTY) + .withDescription("The unique identifier for the lock."); + + public static class JdbcLockConfig { + + public static final String JDBC = "jdbc"; + + public static final ConfigOption JDBC_URI_OPTION = + ConfigOptions.key(PREFIX + JDBC + ".uri") + .stringType() + .defaultValue(StringUtils.EMPTY) + .withDescription("The URI of the JDBC connection for acquiring the lock."); + + public static final ConfigOption JDBC_INIT_LOCK_TABLE_OPTION = + ConfigOptions.key(PREFIX + JDBC + ".init-lock-table") + .stringType() + .defaultValue(Boolean.FALSE.toString()) + .withDescription("Whether to initialize the lock table in the JDBC database."); + } + + public static class ZkLockConfig { + public static final String ZK = "zookeeper"; + + public static final ConfigOption ZK_URI_OPTION = + ConfigOptions.key(PREFIX + ZK + ".uri") + .stringType() + .defaultValue(StringUtils.EMPTY) + .withDescription("The URI of the Zookeeper service for acquiring the lock."); + + public static final ConfigOption ZK_SESSION_TIMEOUT_MS_OPTION = + ConfigOptions.key(PREFIX + ZK + ".session-timeout-ms") + .intType() + .defaultValue(60000) + .withDescription("The session timeout (in milliseconds) for the Zookeeper client."); + + public static final ConfigOption ZK_CONNECTION_TIMEOUT_MS_OPTION = + ConfigOptions.key(PREFIX + ZK + ".connection-timeout-ms") + .intType() + .defaultValue(15000) + .withDescription("The connection timeout (in milliseconds) for the Zookeeper client."); + + public static final ConfigOption ZK_BASE_SLEEP_MS_OPTION = + ConfigOptions.key(PREFIX + ZK + ".base-sleep-ms") + .intType() + .defaultValue(3000) + .withDescription( + "The base sleep time (in milliseconds) between retries for the Zookeeper client."); + + public static final ConfigOption ZK_MAX_RETRIES_OPTION = + ConfigOptions.key(PREFIX + ZK + ".max-retries") + .intType() + .defaultValue(3) + .withDescription("The maximum number of retries for the Zookeeper client."); + } + + private final FlinkConfParser confParser; + private final Map writeProperties; + private final Map setProperties; + + public LockConfig(Table table, Map writeOptions, ReadableConfig readableConfig) { + this.writeProperties = writeOptions; + this.setProperties = readableConfig.toMap(); + this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); + } + + /** Gets the lock type configuration value (e.g., jdbc or zookeeper). 
*/ + public String lockType() { + return confParser + .stringConf() + .option(LOCK_TYPE_OPTION.key()) + .flinkConfig(LOCK_TYPE_OPTION) + .defaultValue(LOCK_TYPE_OPTION.defaultValue()) + .parse(); + } + + /** Gets the lock ID configuration value. If blank, returns the provided default value. */ + public String lockId(String defaultValue) { + String lockId = + confParser + .stringConf() + .option(LOCK_ID_OPTION.key()) + .flinkConfig(LOCK_ID_OPTION) + .defaultValue(LOCK_ID_OPTION.defaultValue()) + .parse(); + if (StringUtils.isBlank(lockId)) { + return defaultValue; + } + + return lockId; + } + + /** Gets the JDBC URI configuration value. */ + public String jdbcUri() { + return confParser + .stringConf() + .option(JdbcLockConfig.JDBC_URI_OPTION.key()) + .flinkConfig(JdbcLockConfig.JDBC_URI_OPTION) + .defaultValue(JdbcLockConfig.JDBC_URI_OPTION.defaultValue()) + .parse(); + } + + /** Gets the configuration value for initializing the JDBC lock table. */ + public String jdbcInitTable() { + return confParser + .stringConf() + .option(JdbcLockConfig.JDBC_INIT_LOCK_TABLE_OPTION.key()) + .flinkConfig(JdbcLockConfig.JDBC_INIT_LOCK_TABLE_OPTION) + .defaultValue(JdbcLockConfig.JDBC_INIT_LOCK_TABLE_OPTION.defaultValue()) + .parse(); + } + + /** Gets the Zookeeper URI configuration value. */ + public String zkUri() { + return confParser + .stringConf() + .option(ZkLockConfig.ZK_URI_OPTION.key()) + .flinkConfig(ZkLockConfig.ZK_URI_OPTION) + .defaultValue(ZkLockConfig.ZK_URI_OPTION.defaultValue()) + .parse(); + } + + /** Gets the Zookeeper session timeout configuration (in milliseconds). */ + public int zkSessionTimeoutMs() { + return confParser + .intConf() + .option(ZkLockConfig.ZK_SESSION_TIMEOUT_MS_OPTION.key()) + .flinkConfig(ZkLockConfig.ZK_SESSION_TIMEOUT_MS_OPTION) + .defaultValue(ZkLockConfig.ZK_SESSION_TIMEOUT_MS_OPTION.defaultValue()) + .parse(); + } + + /** Gets the Zookeeper connection timeout configuration (in milliseconds). */ + public int zkConnectionTimeoutMs() { + return confParser + .intConf() + .option(ZkLockConfig.ZK_CONNECTION_TIMEOUT_MS_OPTION.key()) + .flinkConfig(ZkLockConfig.ZK_CONNECTION_TIMEOUT_MS_OPTION) + .defaultValue(ZkLockConfig.ZK_CONNECTION_TIMEOUT_MS_OPTION.defaultValue()) + .parse(); + } + + /** Gets the Zookeeper base sleep time configuration (in milliseconds). */ + public int zkBaseSleepMs() { + return confParser + .intConf() + .option(ZkLockConfig.ZK_BASE_SLEEP_MS_OPTION.key()) + .flinkConfig(ZkLockConfig.ZK_BASE_SLEEP_MS_OPTION) + .defaultValue(ZkLockConfig.ZK_BASE_SLEEP_MS_OPTION.defaultValue()) + .parse(); + } + + /** Gets the Zookeeper maximum retry count configuration. 
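A sketch showing how the lock options above can be set on the Flink configuration and resolved through LockConfig (imports omitted); the table instance, URI, and lock id are illustrative.

// Select the JDBC-backed lock and let LockConfig resolve the settings.
Configuration flinkConf = new Configuration();
flinkConf.set(LockConfig.LOCK_TYPE_OPTION, LockConfig.JdbcLockConfig.JDBC);
flinkConf.set(LockConfig.JdbcLockConfig.JDBC_URI_OPTION, "jdbc:postgresql://dbhost:5432/iceberg");
flinkConf.set(LockConfig.JdbcLockConfig.JDBC_INIT_LOCK_TABLE_OPTION, "true");

LockConfig lockConfig = new LockConfig(table, Collections.emptyMap(), flinkConf);
String lockType = lockConfig.lockType();               // "jdbc"
String lockId = lockConfig.lockId("catalog.db.table"); // falls back to the given default when unset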
*/ + public int zkMaxRetries() { + return confParser + .intConf() + .option(ZkLockConfig.ZK_MAX_RETRIES_OPTION.key()) + .flinkConfig(ZkLockConfig.ZK_MAX_RETRIES_OPTION) + .defaultValue(ZkLockConfig.ZK_MAX_RETRIES_OPTION.defaultValue()) + .parse(); + } + + public Map properties() { + Map mergeConfig = Maps.newHashMap(); + mergeConfig.putAll(setProperties); + mergeConfig.putAll(writeProperties); + return mergeConfig.entrySet().stream() + .filter(entry -> entry.getKey().startsWith(PREFIX)) + .collect( + Collectors.toMap( + entry -> entry.getKey().substring(PREFIX.length()), + Map.Entry::getValue, + (existing, replacement) -> existing, + Maps::newHashMap)); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java new file mode 100644 index 000000000000..5d5f17b0a80e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.api; + +import java.time.Duration; +import org.apache.flink.annotation.Experimental; +import org.apache.flink.api.common.operators.util.OperatorValidationUtils; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.operator.TriggerEvaluator; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +@Experimental +@SuppressWarnings("unchecked") +public abstract class MaintenanceTaskBuilder> { + private int index; + private String taskName; + private String tableName; + private TableLoader tableLoader; + private String uidSuffix = null; + private String slotSharingGroup = null; + private Integer parallelism = null; + private final TriggerEvaluator.Builder triggerEvaluator = new TriggerEvaluator.Builder(); + + abstract DataStream append(DataStream sourceStream); + + abstract String maintenanceTaskName(); + + /** + * After a given number of Iceberg table commits since the last run, starts the downstream job. + * + * @param commitCount after the downstream job should be started + */ + public T scheduleOnCommitCount(int commitCount) { + triggerEvaluator.commitCount(commitCount); + return (T) this; + } + + /** + * After a given number of new data files since the last run, starts the downstream job. 
+ * + * @param dataFileCount after the downstream job should be started + */ + public T scheduleOnDataFileCount(int dataFileCount) { + triggerEvaluator.dataFileCount(dataFileCount); + return (T) this; + } + + /** + * After a given aggregated data file size since the last run, starts the downstream job. + * + * @param dataFileSizeInBytes after the downstream job should be started + */ + public T scheduleOnDataFileSize(long dataFileSizeInBytes) { + triggerEvaluator.dataFileSizeInBytes(dataFileSizeInBytes); + return (T) this; + } + + /** + * After a given number of new positional delete files since the last run, starts the downstream + * job. + * + * @param posDeleteFileCount after the downstream job should be started + */ + public T scheduleOnPosDeleteFileCount(int posDeleteFileCount) { + triggerEvaluator.posDeleteFileCount(posDeleteFileCount); + return (T) this; + } + + /** + * After a given number of new positional delete records since the last run, starts the downstream + * job. + * + * @param posDeleteRecordCount after the downstream job should be started + */ + public T scheduleOnPosDeleteRecordCount(long posDeleteRecordCount) { + triggerEvaluator.posDeleteRecordCount(posDeleteRecordCount); + return (T) this; + } + + /** + * After a given number of new equality delete files since the last run, starts the downstream + * job. + * + * @param eqDeleteFileCount after the downstream job should be started + */ + public T scheduleOnEqDeleteFileCount(int eqDeleteFileCount) { + triggerEvaluator.eqDeleteFileCount(eqDeleteFileCount); + return (T) this; + } + + /** + * After a given number of new equality delete records since the last run, starts the downstream + * job. + * + * @param eqDeleteRecordCount after the downstream job should be started + */ + public T scheduleOnEqDeleteRecordCount(long eqDeleteRecordCount) { + triggerEvaluator.eqDeleteRecordCount(eqDeleteRecordCount); + return (T) this; + } + + /** + * After a given time since the last run, starts the downstream job. + * + * @param interval after the downstream job should be started + */ + public T scheduleOnInterval(Duration interval) { + triggerEvaluator.timeout(interval); + return (T) this; + } + + /** + * The suffix used for the generated {@link org.apache.flink.api.dag.Transformation}'s uid. + * + * @param newUidSuffix for the transformations + */ + public T uidSuffix(String newUidSuffix) { + this.uidSuffix = newUidSuffix; + return (T) this; + } + + /** + * The {@link SingleOutputStreamOperator#slotSharingGroup(String)} for all the operators of the + * generated stream. Could be used to separate the resources used by this task. + * + * @param newSlotSharingGroup to be used for the operators + */ + public T slotSharingGroup(String newSlotSharingGroup) { + this.slotSharingGroup = newSlotSharingGroup; + return (T) this; + } + + /** + * Sets the parallelism for the stream. 
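A sketch of how the shared scheduling and placement methods above are chained on a concrete task builder (imports omitted); the thresholds and names are illustrative.

// Any MaintenanceTaskBuilder subclass inherits the scheduling and placement methods,
// e.g. the snapshot-expiry task defined earlier in this patch.
ExpireSnapshots.Builder scheduledExpiry =
    ExpireSnapshots.builder()
        .scheduleOnCommitCount(10)               // trigger after 10 table commits
        .scheduleOnInterval(Duration.ofHours(6)) // trigger when 6 hours passed since the last run
        .uidSuffix("expire-snapshots")
        .slotSharingGroup("maintenance")
        .parallelism(2);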
+ * + * @param newParallelism the required parallelism + */ + public T parallelism(int newParallelism) { + OperatorValidationUtils.validateParallelism(newParallelism); + this.parallelism = newParallelism; + return (T) this; + } + + protected int index() { + return index; + } + + protected String taskName() { + return taskName; + } + + protected String tableName() { + return tableName; + } + + protected TableLoader tableLoader() { + return tableLoader; + } + + protected String uidSuffix() { + return uidSuffix; + } + + protected String slotSharingGroup() { + return slotSharingGroup; + } + + protected Integer parallelism() { + return parallelism; + } + + protected String operatorName(String operatorNameBase) { + return operatorNameBase + "[" + index() + "]"; + } + + TriggerEvaluator evaluator() { + return triggerEvaluator.build(); + } + + DataStream append( + DataStream sourceStream, + String newTableName, + String newTaskName, + int taskIndex, + TableLoader newTableLoader, + String defaultUidSuffix, + String defaultSlotSharingGroup, + int defaultParallelism) { + Preconditions.checkNotNull(newTaskName, "Task name should not be null"); + Preconditions.checkNotNull(newTableLoader, "TableLoader should not be null"); + + this.index = taskIndex; + this.taskName = newTaskName; + this.tableName = newTableName; + this.tableLoader = newTableLoader; + + if (uidSuffix == null) { + uidSuffix = this.taskName + "_" + index + "_" + defaultUidSuffix; + } + + if (parallelism == null) { + parallelism = defaultParallelism; + } + + if (slotSharingGroup == null) { + slotSharingGroup = defaultSlotSharingGroup; + } + + return append(sourceStream); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java new file mode 100644 index 000000000000..bedf70725a63 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java @@ -0,0 +1,294 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import java.time.Duration; +import java.util.Map; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.iceberg.actions.BinPackRewriteFilePlanner; +import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.maintenance.operator.DataFileRewriteCommitter; +import org.apache.iceberg.flink.maintenance.operator.DataFileRewritePlanner; +import org.apache.iceberg.flink.maintenance.operator.DataFileRewriteRunner; +import org.apache.iceberg.flink.maintenance.operator.TaskResultAggregator; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** + * Creates the data file rewriter data stream. Which runs a single iteration of the task for every + * {@link Trigger} event. + */ +public class RewriteDataFiles { + static final String PLANNER_TASK_NAME = "RDF Planner"; + static final String REWRITE_TASK_NAME = "Rewrite"; + static final String COMMIT_TASK_NAME = "Rewrite commit"; + static final String AGGREGATOR_TASK_NAME = "Rewrite aggregator"; + + private RewriteDataFiles() {} + + /** Creates the builder for a stream which rewrites data files for the table. */ + public static Builder builder() { + return new Builder(); + } + + public static class Builder extends MaintenanceTaskBuilder { + private boolean partialProgressEnabled = + org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_ENABLED_DEFAULT; + private int partialProgressMaxCommits = + org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT; + private final Map rewriteOptions = Maps.newHashMapWithExpectedSize(6); + private long maxRewriteBytes = Long.MAX_VALUE; + private Expression filter = Expressions.alwaysTrue(); + + @Override + String maintenanceTaskName() { + return "RewriteDataFiles"; + } + + /** + * Allows committing compacted data files in batches. See {@link + * org.apache.iceberg.actions.RewriteDataFiles#PARTIAL_PROGRESS_ENABLED} for more details. + * + * @param newPartialProgressEnabled to enable partial commits + */ + public Builder partialProgressEnabled(boolean newPartialProgressEnabled) { + this.partialProgressEnabled = newPartialProgressEnabled; + return this; + } + + /** + * Configures the size of batches if {@link #partialProgressEnabled}. See {@link + * org.apache.iceberg.actions.RewriteDataFiles#PARTIAL_PROGRESS_MAX_COMMITS} for more details. + * + * @param newPartialProgressMaxCommits to target number of the commits per run + */ + public Builder partialProgressMaxCommits(int newPartialProgressMaxCommits) { + this.partialProgressMaxCommits = newPartialProgressMaxCommits; + return this; + } + + /** + * Configures the maximum byte size of the rewrites for one scheduled compaction. This could be + * used to limit the resources used by the compaction. + * + * @param newMaxRewriteBytes to limit the size of the rewrites + */ + public Builder maxRewriteBytes(long newMaxRewriteBytes) { + this.maxRewriteBytes = newMaxRewriteBytes; + return this; + } + + /** + * Configures the target file size. See {@link + * org.apache.iceberg.actions.RewriteDataFiles#TARGET_FILE_SIZE_BYTES} for more details. 
+ * + * @param targetFileSizeBytes target file size + */ + public Builder targetFileSizeBytes(long targetFileSizeBytes) { + this.rewriteOptions.put( + SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, String.valueOf(targetFileSizeBytes)); + return this; + } + + /** + * Configures the min file size considered for rewriting. See {@link + * SizeBasedFileRewritePlanner#MIN_FILE_SIZE_BYTES} for more details. + * + * @param minFileSizeBytes min file size + */ + public Builder minFileSizeBytes(long minFileSizeBytes) { + this.rewriteOptions.put( + SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, String.valueOf(minFileSizeBytes)); + return this; + } + + /** + * Configures the max file size considered for rewriting. See {@link + * SizeBasedFileRewritePlanner#MAX_FILE_SIZE_BYTES} for more details. + * + * @param maxFileSizeBytes max file size + */ + public Builder maxFileSizeBytes(long maxFileSizeBytes) { + this.rewriteOptions.put( + SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, String.valueOf(maxFileSizeBytes)); + return this; + } + + /** + * Configures the minimum file number after a rewrite is always initiated. See description see + * {@link SizeBasedFileRewritePlanner#MIN_INPUT_FILES} for more details. + * + * @param minInputFiles min file number + */ + public Builder minInputFiles(int minInputFiles) { + this.rewriteOptions.put( + SizeBasedFileRewritePlanner.MIN_INPUT_FILES, String.valueOf(minInputFiles)); + return this; + } + + /** + * Configures the minimum delete file number for a file after a rewrite is always initiated. See + * {@link BinPackRewriteFilePlanner#DELETE_FILE_THRESHOLD} for more details. + * + * @param deleteFileThreshold min delete file number + */ + public Builder deleteFileThreshold(int deleteFileThreshold) { + this.rewriteOptions.put( + BinPackRewriteFilePlanner.DELETE_FILE_THRESHOLD, String.valueOf(deleteFileThreshold)); + return this; + } + + /** + * Overrides other options and forces rewriting of all provided files. + * + * @param rewriteAll enables a full rewrite + */ + public Builder rewriteAll(boolean rewriteAll) { + this.rewriteOptions.put(SizeBasedFileRewritePlanner.REWRITE_ALL, String.valueOf(rewriteAll)); + return this; + } + + /** + * Configures the group size for rewriting. See {@link + * SizeBasedFileRewritePlanner#MAX_FILE_GROUP_SIZE_BYTES} for more details. + * + * @param maxFileGroupSizeBytes file group size for rewrite + */ + public Builder maxFileGroupSizeBytes(long maxFileGroupSizeBytes) { + this.rewriteOptions.put( + SizeBasedFileRewritePlanner.MAX_FILE_GROUP_SIZE_BYTES, + String.valueOf(maxFileGroupSizeBytes)); + return this; + } + + /** + * Configures max files to rewrite. See {@link BinPackRewriteFilePlanner#MAX_FILES_TO_REWRITE} + * for more details. + * + * @param maxFilesToRewrite maximum files to rewrite + */ + public Builder maxFilesToRewrite(int maxFilesToRewrite) { + this.rewriteOptions.put( + BinPackRewriteFilePlanner.MAX_FILES_TO_REWRITE, String.valueOf(maxFilesToRewrite)); + return this; + } + + /** + * A user provided filter for determining which files will be considered by the rewrite + * strategy. + * + * @param newFilter the filter expression to apply + * @return this for method chaining + */ + public Builder filter(Expression newFilter) { + this.filter = newFilter; + return this; + } + + /** + * Configures the properties for the rewriter. 
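A sketch of the rewrite builder options above (imports omitted); the sizes, thresholds, and the partition column in the filter are illustrative. The same settings can also come from configuration and be applied through the config(RewriteDataFilesConfig) method that follows.

// Compact towards ~512 MB files, allow partial-progress commits in batches of 10
// file groups, cap one run at 20 GB of rewrites, and only consider one partition.
RewriteDataFiles.Builder rewrite =
    RewriteDataFiles.builder()
        .targetFileSizeBytes(512L * 1024 * 1024)
        .minInputFiles(5)
        .partialProgressEnabled(true)
        .partialProgressMaxCommits(10)
        .maxRewriteBytes(20L * 1024 * 1024 * 1024)
        .filter(Expressions.equal("event_date", "2024-01-01"));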
+ * + * @param rewriteDataFilesConfig properties for the rewriter + */ + public Builder config(RewriteDataFilesConfig rewriteDataFilesConfig) { + + // Config about the rewriter + this.partialProgressEnabled(rewriteDataFilesConfig.partialProgressEnable()) + .partialProgressMaxCommits(rewriteDataFilesConfig.partialProgressMaxCommits()) + .maxRewriteBytes(rewriteDataFilesConfig.maxRewriteBytes()) + // Config about the schedule + .scheduleOnCommitCount(rewriteDataFilesConfig.scheduleOnCommitCount()) + .scheduleOnDataFileCount(rewriteDataFilesConfig.scheduleOnDataFileCount()) + .scheduleOnDataFileSize(rewriteDataFilesConfig.scheduleOnDataFileSize()) + .scheduleOnInterval( + Duration.ofSeconds(rewriteDataFilesConfig.scheduleOnIntervalSecond())); + + // override the rewrite options + this.rewriteOptions.putAll(rewriteDataFilesConfig.properties()); + + return this; + } + + /** + * The input is a {@link DataStream} with {@link Trigger} events and every event should be + * immediately followed by a {@link Watermark} with the same timestamp as the event. + * + *
<p>
The output is a {@link DataStream} with the {@link TaskResult} of the run followed by the + * {@link Watermark}. + */ + @Override + DataStream append(DataStream trigger) { + SingleOutputStreamOperator planned = + trigger + .process( + new DataFileRewritePlanner( + tableName(), + taskName(), + index(), + tableLoader(), + partialProgressEnabled ? partialProgressMaxCommits : 1, + maxRewriteBytes, + rewriteOptions, + filter)) + .name(operatorName(PLANNER_TASK_NAME)) + .uid(PLANNER_TASK_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .forceNonParallel(); + + SingleOutputStreamOperator rewritten = + planned + .rebalance() + .process(new DataFileRewriteRunner(tableName(), taskName(), index())) + .name(operatorName(REWRITE_TASK_NAME)) + .uid(REWRITE_TASK_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .setParallelism(parallelism()); + + SingleOutputStreamOperator updated = + rewritten + .transform( + operatorName(COMMIT_TASK_NAME), + TypeInformation.of(Trigger.class), + new DataFileRewriteCommitter(tableName(), taskName(), index(), tableLoader())) + .uid(COMMIT_TASK_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .forceNonParallel(); + + return trigger + .union(updated) + .connect( + planned + .getSideOutput(TaskResultAggregator.ERROR_STREAM) + .union( + rewritten.getSideOutput(TaskResultAggregator.ERROR_STREAM), + updated.getSideOutput(TaskResultAggregator.ERROR_STREAM))) + .transform( + operatorName(AGGREGATOR_TASK_NAME), + TypeInformation.of(TaskResult.class), + new TaskResultAggregator(tableName(), taskName(), index())) + .uid(AGGREGATOR_TASK_NAME + uidSuffix()) + .slotSharingGroup(slotSharingGroup()) + .forceNonParallel(); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java new file mode 100644 index 000000000000..b2fb83b75b86 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.Table; +import org.apache.iceberg.actions.RewriteDataFiles; +import org.apache.iceberg.flink.FlinkConfParser; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +public class RewriteDataFilesConfig { + public static final String PREFIX = FlinkMaintenanceConfig.PREFIX + "rewrite."; + + public static final String MAX_BYTES = PREFIX + "max-bytes"; + public static final ConfigOption MAX_BYTES_OPTION = + ConfigOptions.key(MAX_BYTES) + .longType() + .defaultValue(Long.MAX_VALUE) + .withDescription( + "The maximum number of bytes allowed for a rewrite operation. " + + "If the total size of data files exceeds this limit, the rewrites within one scheduled compaction " + + "will be limited in size to restrict the resources used by the compaction."); + + public static final ConfigOption PARTIAL_PROGRESS_MAX_COMMITS_OPTION = + ConfigOptions.key(PREFIX + RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS) + .intType() + .defaultValue(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT) + .withDescription( + "The maximum number of commits allowed when partial progress is enabled. " + + "This configuration controls how many file groups " + + "are committed per run when partial progress is enabled."); + + public static final ConfigOption PARTIAL_PROGRESS_ENABLED_OPTION = + ConfigOptions.key(PREFIX + RewriteDataFiles.PARTIAL_PROGRESS_ENABLED) + .booleanType() + .defaultValue(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED_DEFAULT) + .withDescription( + "Whether to enable partial progress commits. " + + "When enabled, the rewrite operation will commit by file group, " + + "allowing progress even if some file groups fail to commit."); + + public static final String SCHEDULE_ON_COMMIT_COUNT = PREFIX + "schedule.commit-count"; + public static final ConfigOption SCHEDULE_ON_COMMIT_COUNT_OPTION = + ConfigOptions.key(SCHEDULE_ON_COMMIT_COUNT) + .intType() + .defaultValue(10) + .withDescription( + "The number of commits after which to trigger a new rewrite operation. " + + "This setting controls the frequency of rewrite operations."); + + public static final String SCHEDULE_ON_DATA_FILE_COUNT = PREFIX + "schedule.data-file-count"; + public static final ConfigOption SCHEDULE_ON_DATA_FILE_COUNT_OPTION = + ConfigOptions.key(SCHEDULE_ON_DATA_FILE_COUNT) + .intType() + .defaultValue(1000) + .withDescription("The number of data files that should trigger a new rewrite operation."); + + public static final String SCHEDULE_ON_DATA_FILE_SIZE = PREFIX + "schedule.data-file-size"; + public static final ConfigOption SCHEDULE_ON_DATA_FILE_SIZE_OPTION = + ConfigOptions.key(SCHEDULE_ON_DATA_FILE_SIZE) + .longType() + .defaultValue(100L * 1024 * 1024 * 1024) // Default is 100 GB + .withDescription( + "The total size of data files that should trigger a new rewrite operation."); + + public static final String SCHEDULE_ON_INTERVAL_SECOND = PREFIX + "schedule.interval-second"; + public static final ConfigOption SCHEDULE_ON_INTERVAL_SECOND_OPTION = + ConfigOptions.key(SCHEDULE_ON_INTERVAL_SECOND) + .longType() + .defaultValue(10 * 60L) // Default is 10 minutes + .withDescription( + "The time interval (in seconds) between two consecutive rewrite operations. 
" + + "This ensures periodic scheduling of rewrite tasks."); + + private final FlinkConfParser confParser; + private final Map writeProperties; + + public RewriteDataFilesConfig( + Table table, Map writeOptions, ReadableConfig readableConfig) { + this.writeProperties = writeOptions; + this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); + } + + /** Gets the number of commits that trigger a rewrite operation. */ + public int scheduleOnCommitCount() { + return confParser + .intConf() + .option(SCHEDULE_ON_COMMIT_COUNT) + .flinkConfig(SCHEDULE_ON_COMMIT_COUNT_OPTION) + .defaultValue(SCHEDULE_ON_COMMIT_COUNT_OPTION.defaultValue()) + .parse(); + } + + /** Gets the number of data files that trigger a rewrite operation. */ + public int scheduleOnDataFileCount() { + return confParser + .intConf() + .option(SCHEDULE_ON_DATA_FILE_COUNT) + .flinkConfig(SCHEDULE_ON_DATA_FILE_COUNT_OPTION) + .defaultValue(SCHEDULE_ON_DATA_FILE_COUNT_OPTION.defaultValue()) + .parse(); + } + + /** Gets the total size of data files that trigger a rewrite operation. */ + public long scheduleOnDataFileSize() { + return confParser + .longConf() + .option(SCHEDULE_ON_DATA_FILE_SIZE) + .flinkConfig(SCHEDULE_ON_DATA_FILE_SIZE_OPTION) + .defaultValue(SCHEDULE_ON_DATA_FILE_SIZE_OPTION.defaultValue()) + .parse(); + } + + /** Gets the time interval (in seconds) between two consecutive rewrite operations. */ + public long scheduleOnIntervalSecond() { + return confParser + .longConf() + .option(SCHEDULE_ON_INTERVAL_SECOND) + .flinkConfig(SCHEDULE_ON_INTERVAL_SECOND_OPTION) + .defaultValue(SCHEDULE_ON_INTERVAL_SECOND_OPTION.defaultValue()) + .parse(); + } + + /** Gets whether partial progress commits are enabled. */ + public boolean partialProgressEnable() { + return confParser + .booleanConf() + .option(PARTIAL_PROGRESS_ENABLED_OPTION.key()) + .flinkConfig(PARTIAL_PROGRESS_ENABLED_OPTION) + .defaultValue(PARTIAL_PROGRESS_ENABLED_OPTION.defaultValue()) + .parse(); + } + + /** Gets the maximum number of commits allowed for partial progress. */ + public int partialProgressMaxCommits() { + return confParser + .intConf() + .option(PARTIAL_PROGRESS_MAX_COMMITS_OPTION.key()) + .flinkConfig(PARTIAL_PROGRESS_MAX_COMMITS_OPTION) + .defaultValue(PARTIAL_PROGRESS_MAX_COMMITS_OPTION.defaultValue()) + .parse(); + } + + /** Gets the maximum rewrite bytes allowed for a single rewrite operation. */ + public long maxRewriteBytes() { + return confParser + .longConf() + .option(MAX_BYTES) + .flinkConfig(MAX_BYTES_OPTION) + .defaultValue(MAX_BYTES_OPTION.defaultValue()) + .parse(); + } + + public Map properties() { + return writeProperties.entrySet().stream() + .filter(entry -> entry.getKey().startsWith(PREFIX)) + .collect( + Collectors.toMap( + entry -> entry.getKey().substring(PREFIX.length()), + Map.Entry::getValue, + (existing, replacement) -> existing, + Maps::newHashMap)); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java new file mode 100644 index 000000000000..1a2b0607dd1e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.api; + +import java.io.IOException; +import java.time.Duration; +import java.util.List; +import java.util.Locale; +import java.util.UUID; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.eventtime.TimestampAssigner; +import org.apache.flink.api.common.eventtime.TimestampAssignerSupplier; +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.common.eventtime.WatermarkGenerator; +import org.apache.flink.api.common.eventtime.WatermarkGeneratorSupplier; +import org.apache.flink.api.common.eventtime.WatermarkOutput; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.operators.util.OperatorValidationUtils; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; +import org.apache.flink.api.dag.Transformation; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamUtils; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.operator.LockRemover; +import org.apache.iceberg.flink.maintenance.operator.MonitorSource; +import org.apache.iceberg.flink.maintenance.operator.TableChange; +import org.apache.iceberg.flink.maintenance.operator.TriggerEvaluator; +import org.apache.iceberg.flink.maintenance.operator.TriggerManager; +import org.apache.iceberg.flink.sink.IcebergSink; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** Creates the table maintenance graph. */ +public class TableMaintenance { + static final String SOURCE_OPERATOR_NAME_PREFIX = "Monitor source for "; + static final String TRIGGER_MANAGER_OPERATOR_NAME = "Trigger manager"; + static final String WATERMARK_ASSIGNER_OPERATOR_NAME = "Watermark Assigner"; + static final String FILTER_OPERATOR_NAME_PREFIX = "Filter "; + static final String LOCK_REMOVER_OPERATOR_NAME = "Lock remover"; + + static final long RATE_LIMIT_SECOND_DEFAULT = 60; + static final long LOCK_CHECK_DELAY_SECOND_DEFAULT = 30; + static final int MAX_READ_BACK_DEFAULT = 100; + + private TableMaintenance() {} + + /** + * Use when the change stream is already provided, like in the {@link + * IcebergSink#addPostCommitTopology(DataStream)}. 
+ * + * @param changeStream the table changes + * @param tableLoader used for accessing the table + * @param lockFactory used for preventing concurrent task runs + * @return builder for the maintenance stream + */ + @Internal + public static Builder forChangeStream( + DataStream changeStream, + TableLoader tableLoader, + TriggerLockFactory lockFactory) { + Preconditions.checkNotNull(changeStream, "The change stream should not be null"); + Preconditions.checkNotNull(tableLoader, "TableLoader should not be null"); + Preconditions.checkNotNull(lockFactory, "LockFactory should not be null"); + + return new Builder(null, changeStream, tableLoader, lockFactory); + } + + /** + * Use this for standalone maintenance job. It creates a monitor source that detect table changes + * and build the maintenance pipelines afterwards. + * + * @param env used to register the monitor source + * @param tableLoader used for accessing the table + * @param lockFactory used for preventing concurrent task runs + * @return builder for the maintenance stream + */ + public static Builder forTable( + StreamExecutionEnvironment env, TableLoader tableLoader, TriggerLockFactory lockFactory) { + Preconditions.checkNotNull(env, "StreamExecutionEnvironment should not be null"); + Preconditions.checkNotNull(tableLoader, "TableLoader should not be null"); + Preconditions.checkNotNull(lockFactory, "LockFactory should not be null"); + + return new Builder(env, null, tableLoader, lockFactory); + } + + public static class Builder { + private final StreamExecutionEnvironment env; + private final DataStream inputStream; + private final TableLoader tableLoader; + private final List> taskBuilders; + private final TriggerLockFactory lockFactory; + + private String uidSuffix = "TableMaintenance-" + UUID.randomUUID(); + private String slotSharingGroup = StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP; + private Duration rateLimit = Duration.ofSeconds(RATE_LIMIT_SECOND_DEFAULT); + private Duration lockCheckDelay = Duration.ofSeconds(LOCK_CHECK_DELAY_SECOND_DEFAULT); + private int parallelism = ExecutionConfig.PARALLELISM_DEFAULT; + private int maxReadBack = MAX_READ_BACK_DEFAULT; + + private Builder( + StreamExecutionEnvironment env, + DataStream inputStream, + TableLoader tableLoader, + TriggerLockFactory lockFactory) { + this.env = env; + this.inputStream = inputStream; + this.tableLoader = tableLoader; + this.lockFactory = lockFactory; + this.taskBuilders = Lists.newArrayListWithCapacity(4); + } + + /** + * The suffix used for the generated {@link Transformation}'s uid. + * + * @param newUidSuffix for the transformations + */ + public Builder uidSuffix(String newUidSuffix) { + this.uidSuffix = newUidSuffix; + return this; + } + + /** + * The {@link SingleOutputStreamOperator#slotSharingGroup(String)} for all the operators of the + * generated stream. Could be used to separate the resources used by this task. + * + * @param newSlotSharingGroup to be used for the operators + */ + public Builder slotSharingGroup(String newSlotSharingGroup) { + this.slotSharingGroup = newSlotSharingGroup; + return this; + } + + /** + * Limits the firing frequency for the task triggers. 
+ * + * @param newRateLimit firing frequency + */ + public Builder rateLimit(Duration newRateLimit) { + Preconditions.checkNotNull(newRateLimit, "Rate limit should not be null"); + Preconditions.checkArgument( + newRateLimit.toMillis() > 0, "Rate limit should be greater than 0"); + this.rateLimit = newRateLimit; + return this; + } + + /** + * Sets the delay for checking lock availability when a concurrent run is detected. + * + * @param newLockCheckDelay lock checking frequency + */ + public Builder lockCheckDelay(Duration newLockCheckDelay) { + this.lockCheckDelay = newLockCheckDelay; + return this; + } + + /** + * Sets the default parallelism of maintenance tasks. Could be overwritten by the {@link + * MaintenanceTaskBuilder#parallelism(int)}. + * + * @param newParallelism task parallelism + */ + public Builder parallelism(int newParallelism) { + OperatorValidationUtils.validateParallelism(newParallelism); + this.parallelism = newParallelism; + return this; + } + + /** + * Maximum number of snapshots checked when started with an embedded {@link MonitorSource} at + * the first time. Only available when the {@link + * TableMaintenance#forTable(StreamExecutionEnvironment, TableLoader, TriggerLockFactory)} is + * used. + * + * @param newMaxReadBack snapshots to consider when initializing + */ + public Builder maxReadBack(int newMaxReadBack) { + Preconditions.checkArgument( + inputStream == null, "Can't set maxReadBack when change stream is provided"); + this.maxReadBack = newMaxReadBack; + return this; + } + + /** + * Adds a specific task with the given schedule. + * + * @param task to add + */ + public Builder add(MaintenanceTaskBuilder task) { + taskBuilders.add(task); + return this; + } + + /** Builds the task graph for the maintenance tasks. */ + public void append() throws IOException { + Preconditions.checkArgument(!taskBuilders.isEmpty(), "Provide at least one task"); + Preconditions.checkNotNull(uidSuffix, "Uid suffix should no be null"); + + List taskNames = Lists.newArrayListWithCapacity(taskBuilders.size()); + List evaluators = Lists.newArrayListWithCapacity(taskBuilders.size()); + for (int i = 0; i < taskBuilders.size(); ++i) { + taskNames.add(nameFor(taskBuilders.get(i), i)); + evaluators.add(taskBuilders.get(i).evaluator()); + } + + try (TableLoader loader = tableLoader.clone()) { + loader.open(); + String tableName = loader.loadTable().name(); + DataStream triggers = + DataStreamUtils.reinterpretAsKeyedStream( + changeStream(tableName, loader), unused -> true) + .process( + new TriggerManager( + loader, + lockFactory, + taskNames, + evaluators, + rateLimit.toMillis(), + lockCheckDelay.toMillis())) + .name(TRIGGER_MANAGER_OPERATOR_NAME) + .uid(TRIGGER_MANAGER_OPERATOR_NAME + uidSuffix) + .slotSharingGroup(slotSharingGroup) + .forceNonParallel() + .assignTimestampsAndWatermarks(new PunctuatedWatermarkStrategy()) + .name(WATERMARK_ASSIGNER_OPERATOR_NAME) + .uid(WATERMARK_ASSIGNER_OPERATOR_NAME + uidSuffix) + .slotSharingGroup(slotSharingGroup) + .forceNonParallel(); + + // Add the specific tasks + DataStream unioned = null; + for (int i = 0; i < taskBuilders.size(); ++i) { + int taskIndex = i; + DataStream filtered = + triggers + .filter(t -> t.taskId() != null && t.taskId() == taskIndex) + .name(FILTER_OPERATOR_NAME_PREFIX + taskIndex) + .forceNonParallel() + .uid(FILTER_OPERATOR_NAME_PREFIX + taskIndex + "-" + uidSuffix) + .slotSharingGroup(slotSharingGroup); + MaintenanceTaskBuilder builder = taskBuilders.get(taskIndex); + DataStream result = + builder.append( + filtered, + 
tableName, + taskNames.get(taskIndex), + taskIndex, + loader, + uidSuffix, + slotSharingGroup, + parallelism); + if (unioned == null) { + unioned = result; + } else { + unioned = unioned.union(result); + } + } + + // Add the LockRemover to the end + unioned + .transform( + LOCK_REMOVER_OPERATOR_NAME, + TypeInformation.of(Void.class), + new LockRemover(tableName, lockFactory, taskNames)) + .forceNonParallel() + .uid("lock-remover-" + uidSuffix) + .slotSharingGroup(slotSharingGroup); + } + } + + private DataStream changeStream(String tableName, TableLoader loader) { + if (inputStream == null) { + // Create a monitor source to provide the TableChange stream + MonitorSource source = + new MonitorSource( + loader, RateLimiterStrategy.perSecond(1.0 / rateLimit.getSeconds()), maxReadBack); + return env.fromSource( + source, WatermarkStrategy.noWatermarks(), SOURCE_OPERATOR_NAME_PREFIX + tableName) + .uid(SOURCE_OPERATOR_NAME_PREFIX + uidSuffix) + .slotSharingGroup(slotSharingGroup) + .forceNonParallel(); + } else { + return inputStream.global(); + } + } + + private static String nameFor(MaintenanceTaskBuilder streamBuilder, int taskIndex) { + return String.format(Locale.ROOT, "%s [%d]", streamBuilder.maintenanceTaskName(), taskIndex); + } + } + + @Internal + public static class PunctuatedWatermarkStrategy implements WatermarkStrategy { + @Override + public WatermarkGenerator createWatermarkGenerator( + WatermarkGeneratorSupplier.Context context) { + return new WatermarkGenerator<>() { + @Override + public void onEvent(Trigger event, long eventTimestamp, WatermarkOutput output) { + output.emitWatermark(new Watermark(event.timestamp())); + } + + @Override + public void onPeriodicEmit(WatermarkOutput output) { + // No periodic watermarks + } + }; + } + + @Override + public TimestampAssigner createTimestampAssigner( + TimestampAssignerSupplier.Context context) { + return (element, unused) -> element.timestamp(); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java new file mode 100644 index 000000000000..ca1462526f13 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.api; + +import java.io.Serializable; +import java.util.List; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +/** The result of a single Maintenance Task. 
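+ *
+ * <p>Instances are produced by the individual maintenance tasks and routed to the lock remover.
+ * For illustration only, a successful run without failures could be reported as
+ * {@code new TaskResult(0, trigger.timestamp(), true, Collections.emptyList())}.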
*/ +public class TaskResult implements Serializable { + private final int taskIndex; + private final long startEpoch; + private final boolean success; + private final List exceptions; + + public TaskResult(int taskIndex, long startEpoch, boolean success, List exceptions) { + this.taskIndex = taskIndex; + this.startEpoch = startEpoch; + this.success = success; + this.exceptions = exceptions; + } + + public int taskIndex() { + return taskIndex; + } + + public long startEpoch() { + return startEpoch; + } + + public boolean success() { + return success; + } + + public List exceptions() { + return exceptions; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("taskIndex", taskIndex) + .add("startEpoch", startEpoch) + .add("success", success) + .add("exceptions", exceptions) + .toString(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java new file mode 100644 index 000000000000..09209ba15153 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +public class Trigger { + private final long timestamp; + private final Integer taskId; + private final boolean isRecovery; + + private Trigger(long timestamp, Integer taskId, boolean isRecovery) { + this.timestamp = timestamp; + this.taskId = taskId; + this.isRecovery = isRecovery; + } + + @Internal + public static Trigger create(long timestamp, int taskId) { + return new Trigger(timestamp, taskId, false); + } + + @Internal + public static Trigger recovery(long timestamp) { + return new Trigger(timestamp, null, true); + } + + public long timestamp() { + return timestamp; + } + + public Integer taskId() { + return taskId; + } + + public boolean isRecovery() { + return isRecovery; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("timestamp", timestamp) + .add("taskId", taskId) + .add("isRecovery", isRecovery) + .toString(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java new file mode 100644 index 000000000000..c31381355efe --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.api; + +import java.io.Closeable; +import java.io.Serializable; +import org.apache.flink.annotation.Experimental; +import org.apache.iceberg.flink.maintenance.operator.LockRemover; +import org.apache.iceberg.flink.maintenance.operator.TriggerManager; + +/** Lock interface for handling locks for the Flink Table Maintenance jobs. */ +@Experimental +public interface TriggerLockFactory extends Serializable, Closeable { + void open(); + + Lock createLock(); + + Lock createRecoveryLock(); + + interface Lock { + /** + * Tries to acquire a lock with a given key. Anyone already holding a lock would prevent + * acquiring this lock. Not reentrant. + * + *

<p>Called by {@link TriggerManager}. Implementations could assume that there are no concurrent + * calls for this method. + * + * @return true if the lock is acquired by this job, false if the lock + * is already held by someone + */ + boolean tryLock(); + + /** + * Checks if the lock is already taken. + * + * @return true if the lock is held by someone + */ + boolean isHeld(); + + /** + * Releases the lock. Should not fail if the lock is not held by anyone. + * + *

Called by {@link LockRemover}. Implementations could assume that are no concurrent calls + * for this method. + */ + void unlock(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java new file mode 100644 index 000000000000..539ba6b297c8 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.api; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.flink.shaded.curator5.org.apache.curator.framework.CuratorFramework; +import org.apache.flink.shaded.curator5.org.apache.curator.framework.CuratorFrameworkFactory; +import org.apache.flink.shaded.curator5.org.apache.curator.framework.recipes.shared.SharedCount; +import org.apache.flink.shaded.curator5.org.apache.curator.framework.recipes.shared.VersionedValue; +import org.apache.flink.shaded.curator5.org.apache.curator.retry.ExponentialBackoffRetry; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Zookeeper backed implementation of the {@link TriggerLockFactory}. 
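+ *
+ * <p>An illustrative construction (the host, lock id and timing values are placeholders only):
+ *
+ * <pre>{@code
+ * TriggerLockFactory lockFactory =
+ *     new ZkLockFactory("zk-host:2181", "catalog.db.table", 60000, 15000, 2000, 3);
+ * lockFactory.open();
+ * }</pre>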
*/ +public class ZkLockFactory implements TriggerLockFactory { + private static final Logger LOG = LoggerFactory.getLogger(ZkLockFactory.class); + + private static final String LOCK_BASE_PATH = "/iceberg/flink/maintenance/locks/"; + + private final String connectString; + private final String lockId; + private final int sessionTimeoutMs; + private final int connectionTimeoutMs; + private final int baseSleepTimeMs; + private final int maxRetries; + private transient CuratorFramework client; + private transient SharedCount taskSharedCount; + private transient SharedCount recoverySharedCount; + private volatile boolean isOpen; + + /** + * Create Zookeeper lock factory + * + * @param connectString Zookeeper connection string + * @param lockId which should identify the job and the table + * @param sessionTimeoutMs Session timeout in milliseconds + * @param connectionTimeoutMs Connection timeout in milliseconds + * @param baseSleepTimeMs Base sleep time in milliseconds + * @param maxRetries Maximum number of retries + */ + public ZkLockFactory( + String connectString, + String lockId, + int sessionTimeoutMs, + int connectionTimeoutMs, + int baseSleepTimeMs, + int maxRetries) { + Preconditions.checkNotNull(connectString, "Zookeeper connection string cannot be null"); + Preconditions.checkNotNull(lockId, "Lock ID cannot be null"); + Preconditions.checkArgument( + sessionTimeoutMs >= 0, "Session timeout must be positive, got: %s", sessionTimeoutMs); + Preconditions.checkArgument( + connectionTimeoutMs >= 0, + "Connection timeout must be positive, got: %s", + connectionTimeoutMs); + Preconditions.checkArgument( + baseSleepTimeMs >= 0, "Base sleep time must be positive, got: %s", baseSleepTimeMs); + Preconditions.checkArgument( + maxRetries >= 0, "Max retries must be non-negative, got: %s", maxRetries); + this.connectString = connectString; + this.lockId = lockId; + this.sessionTimeoutMs = sessionTimeoutMs; + this.connectionTimeoutMs = connectionTimeoutMs; + this.baseSleepTimeMs = baseSleepTimeMs; + this.maxRetries = maxRetries; + } + + @Override + public void open() { + if (isOpen) { + LOG.debug("ZkLockFactory already opened for lockId: {}.", lockId); + return; + } + + this.client = + CuratorFrameworkFactory.builder() + .connectString(connectString) + .sessionTimeoutMs(sessionTimeoutMs) + .connectionTimeoutMs(connectionTimeoutMs) + .retryPolicy(new ExponentialBackoffRetry(baseSleepTimeMs, maxRetries)) + .build(); + client.start(); + + try { + if (!client.blockUntilConnected(connectionTimeoutMs, TimeUnit.MILLISECONDS)) { + throw new IllegalStateException("Connection to Zookeeper timed out"); + } + + this.taskSharedCount = new SharedCount(client, getTaskSharePath(), 0); + this.recoverySharedCount = new SharedCount(client, getRecoverySharedPath(), 0); + taskSharedCount.start(); + recoverySharedCount.start(); + isOpen = true; + LOG.info("ZkLockFactory initialized for lockId: {}.", lockId); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while connecting to Zookeeper", e); + } catch (Exception e) { + closeQuietly(); + throw new RuntimeException("Failed to initialize SharedCount", e); + } + } + + private String getTaskSharePath() { + return LOCK_BASE_PATH + lockId + "/task"; + } + + private String getRecoverySharedPath() { + return LOCK_BASE_PATH + lockId + "/recovery"; + } + + private void closeQuietly() { + try { + close(); + } catch (Exception e) { + LOG.warn("Failed to close ZkLockFactory for lockId: {}", lockId, e); + } + } + + 
@Override + public Lock createLock() { + return new ZkLock(getTaskSharePath(), taskSharedCount); + } + + @Override + public Lock createRecoveryLock() { + return new ZkLock(getRecoverySharedPath(), recoverySharedCount); + } + + @Override + public void close() throws IOException { + try { + if (taskSharedCount != null) { + taskSharedCount.close(); + } + + if (recoverySharedCount != null) { + recoverySharedCount.close(); + } + } finally { + if (client != null) { + client.close(); + } + + isOpen = false; + } + } + + /** Zookeeper lock implementation */ + private static class ZkLock implements Lock { + private final SharedCount sharedCount; + private final String lockPath; + + private static final int LOCKED = 1; + private static final int UNLOCKED = 0; + + private ZkLock(String lockPath, SharedCount sharedCount) { + this.lockPath = lockPath; + this.sharedCount = sharedCount; + } + + @Override + public boolean tryLock() { + VersionedValue versionedValue = sharedCount.getVersionedValue(); + if (isHeld(versionedValue)) { + LOG.debug("Lock is already held for path: {}", lockPath); + return false; + } + + try { + boolean acquired = sharedCount.trySetCount(versionedValue, LOCKED); + if (!acquired) { + LOG.debug("Failed to acquire lock for path: {}", lockPath); + } + + return acquired; + } catch (Exception e) { + LOG.warn("Failed to acquire Zookeeper lock", e); + return false; + } + } + + @Override + public boolean isHeld() { + return isHeld(sharedCount.getVersionedValue()); + } + + private static boolean isHeld(VersionedValue versionedValue) { + try { + return versionedValue.getValue() == LOCKED; + } catch (Exception e) { + throw new RuntimeException("Failed to check Zookeeper lock status", e); + } + } + + @Override + public void unlock() { + try { + sharedCount.setCount(UNLOCKED); + LOG.debug("Released lock for path: {}", lockPath); + } catch (Exception e) { + LOG.warn("Failed to release lock for path: {}", lockPath, e); + throw new RuntimeException("Failed to release lock", e); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java new file mode 100644 index 000000000000..135d3d9b42db --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.io.IOException; +import java.util.Set; +import org.apache.flink.annotation.Internal; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Table; +import org.apache.iceberg.actions.RewriteDataFilesCommitManager; +import org.apache.iceberg.actions.RewriteDataFilesCommitManager.CommitService; +import org.apache.iceberg.actions.RewriteFileGroup; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Commits the rewrite changes using {@link RewriteDataFilesCommitManager}. The input is a {@link + * DataFileRewriteRunner.ExecutedGroup}. Only {@link Watermark} is emitted which is chained to + * {@link TaskResultAggregator} input 1. + */ +@Internal +public class DataFileRewriteCommitter extends AbstractStreamOperator + implements OneInputStreamOperator { + private static final Logger LOG = LoggerFactory.getLogger(DataFileRewriteCommitter.class); + + private final String tableName; + private final String taskName; + private final int taskIndex; + private final TableLoader tableLoader; + + private transient Table table; + private transient CommitService commitService; + private transient Counter errorCounter; + private transient Counter addedDataFileNumCounter; + private transient Counter addedDataFileSizeCounter; + private transient Counter removedDataFileNumCounter; + private transient Counter removedDataFileSizeCounter; + + public DataFileRewriteCommitter( + String tableName, String taskName, int taskIndex, TableLoader tableLoader) { + Preconditions.checkNotNull(tableName, "Table name should no be null"); + Preconditions.checkNotNull(taskName, "Task name should no be null"); + Preconditions.checkNotNull(tableLoader, "Table loader should no be null"); + + this.tableName = tableName; + this.taskName = taskName; + this.taskIndex = taskIndex; + this.tableLoader = tableLoader; + } + + @Override + public void open() throws Exception { + super.open(); + + tableLoader.open(); + this.table = tableLoader.loadTable(); + + MetricGroup taskMetricGroup = + TableMaintenanceMetrics.groupFor(getRuntimeContext(), tableName, taskName, taskIndex); + this.errorCounter = taskMetricGroup.counter(TableMaintenanceMetrics.ERROR_COUNTER); + this.addedDataFileNumCounter = + taskMetricGroup.counter(TableMaintenanceMetrics.ADDED_DATA_FILE_NUM_METRIC); + this.addedDataFileSizeCounter = + taskMetricGroup.counter(TableMaintenanceMetrics.ADDED_DATA_FILE_SIZE_METRIC); + this.removedDataFileNumCounter = + taskMetricGroup.counter(TableMaintenanceMetrics.REMOVED_DATA_FILE_NUM_METRIC); + this.removedDataFileSizeCounter = + taskMetricGroup.counter(TableMaintenanceMetrics.REMOVED_DATA_FILE_SIZE_METRIC); + } + + @Override + public void processElement(StreamRecord streamRecord) { + DataFileRewriteRunner.ExecutedGroup executedGroup = streamRecord.getValue(); + try { + if (commitService == null) { + // Refresh the table to get the latest snapshot for the committer + table.refresh(); + + 
FlinkRewriteDataFilesCommitManager commitManager = + new FlinkRewriteDataFilesCommitManager( + table, executedGroup.snapshotId(), streamRecord.getTimestamp()); + this.commitService = commitManager.service(executedGroup.groupsPerCommit()); + commitService.start(); + } + + commitService.offer(executedGroup.group()); + } catch (Exception e) { + LOG.warn( + DataFileRewritePlanner.MESSAGE_PREFIX + "Exception processing {}", + tableName, + taskName, + taskIndex, + streamRecord.getTimestamp(), + executedGroup, + e); + output.collect(TaskResultAggregator.ERROR_STREAM, new StreamRecord<>(e)); + errorCounter.inc(); + } + } + + @Override + public void processWatermark(Watermark mark) throws Exception { + try { + if (commitService != null) { + commitService.close(); + } + + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + "Successfully completed data file compaction", + tableName, + taskName, + taskIndex, + mark.getTimestamp()); + } catch (Exception e) { + LOG.warn( + DataFileRewritePlanner.MESSAGE_PREFIX + "Exception closing commit service", + tableName, + taskName, + taskIndex, + mark.getTimestamp(), + e); + output.collect(TaskResultAggregator.ERROR_STREAM, new StreamRecord<>(e)); + errorCounter.inc(); + } + + // Cleanup + this.commitService = null; + + super.processWatermark(mark); + } + + @Override + public void close() throws IOException { + if (commitService != null) { + commitService.close(); + } + } + + private class FlinkRewriteDataFilesCommitManager extends RewriteDataFilesCommitManager { + private final long timestamp; + + FlinkRewriteDataFilesCommitManager(Table table, long startingSnapshotId, long timestamp) { + super(table, startingSnapshotId); + this.timestamp = timestamp; + } + + @Override + public void commitFileGroups(Set fileGroups) { + super.commitFileGroups(fileGroups); + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + "Committed {}", + tableName, + taskName, + taskIndex, + timestamp, + fileGroups); + updateMetrics(fileGroups); + } + + private void updateMetrics(Set fileGroups) { + for (RewriteFileGroup fileGroup : fileGroups) { + for (DataFile added : fileGroup.addedFiles()) { + addedDataFileNumCounter.inc(); + addedDataFileSizeCounter.inc(added.fileSizeInBytes()); + } + + for (DataFile rewritten : fileGroup.rewrittenFiles()) { + removedDataFileNumCounter.inc(); + removedDataFileSizeCounter.inc(rewritten.fileSizeInBytes()); + } + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java new file mode 100644 index 000000000000..81db62e8bf25 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.math.RoundingMode; +import java.util.List; +import java.util.Map; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.metrics.Counter; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.util.Collector; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableUtil; +import org.apache.iceberg.actions.BinPackRewriteFilePlanner; +import org.apache.iceberg.actions.FileRewritePlan; +import org.apache.iceberg.actions.RewriteDataFiles; +import org.apache.iceberg.actions.RewriteFileGroup; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.math.IntMath; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Plans the rewrite groups using the {@link BinPackRewriteFilePlanner}. The input is the {@link + * Trigger}, the output is zero, or some {@link PlannedGroup}s. + */ +@Internal +public class DataFileRewritePlanner + extends ProcessFunction { + static final String MESSAGE_PREFIX = "[For table {} with {}[{}] at {}]: "; + private static final Logger LOG = LoggerFactory.getLogger(DataFileRewritePlanner.class); + + private final String tableName; + private final String taskName; + private final int taskIndex; + private final TableLoader tableLoader; + private final int partialProgressMaxCommits; + private final long maxRewriteBytes; + private final Map rewriterOptions; + private transient Counter errorCounter; + private final Expression filter; + + public DataFileRewritePlanner( + String tableName, + String taskName, + int taskIndex, + TableLoader tableLoader, + int newPartialProgressMaxCommits, + long maxRewriteBytes, + Map rewriterOptions, + Expression filter) { + + Preconditions.checkNotNull(tableName, "Table name should no be null"); + Preconditions.checkNotNull(taskName, "Task name should no be null"); + Preconditions.checkNotNull(tableLoader, "Table loader should no be null"); + Preconditions.checkNotNull(rewriterOptions, "Options map should no be null"); + + this.tableName = tableName; + this.taskName = taskName; + this.taskIndex = taskIndex; + this.tableLoader = tableLoader; + this.partialProgressMaxCommits = newPartialProgressMaxCommits; + this.maxRewriteBytes = maxRewriteBytes; + this.rewriterOptions = rewriterOptions; + this.filter = filter; + } + + @Override + public void open(OpenContext context) throws Exception { + tableLoader.open(); + Table table = tableLoader.loadTable(); + Preconditions.checkArgument( + !TableUtil.supportsRowLineage(table), + "Flink does not support compaction on row lineage enabled tables (V3+)"); + this.errorCounter = + TableMaintenanceMetrics.groupFor(getRuntimeContext(), tableName, taskName, taskIndex) + .counter(TableMaintenanceMetrics.ERROR_COUNTER); + } + + @Override + public void processElement(Trigger value, Context ctx, Collector out) + throws Exception { + LOG.info( + 
DataFileRewritePlanner.MESSAGE_PREFIX + "Creating rewrite plan", + tableName, + taskName, + taskIndex, + ctx.timestamp()); + try { + SerializableTable table = + (SerializableTable) SerializableTable.copyOf(tableLoader.loadTable()); + if (table.currentSnapshot() == null) { + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + "Nothing to plan for in an empty table", + tableName, + taskName, + taskIndex, + ctx.timestamp()); + return; + } + + BinPackRewriteFilePlanner planner = new BinPackRewriteFilePlanner(table, filter); + planner.init(rewriterOptions); + + FileRewritePlan + plan = planner.plan(); + + long rewriteBytes = 0; + List groups = Lists.newArrayList(); + for (CloseableIterator groupIterator = plan.groups().iterator(); + groupIterator.hasNext(); ) { + RewriteFileGroup group = groupIterator.next(); + if (rewriteBytes + group.inputFilesSizeInBytes() > maxRewriteBytes) { + // Keep going, maybe some other group might fit in + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + + "Skipping group as max rewrite size reached {}", + tableName, + taskName, + taskIndex, + ctx.timestamp(), + group); + } else { + rewriteBytes += group.inputFilesSizeInBytes(); + groups.add(group); + } + } + + int groupsPerCommit = + IntMath.divide(groups.size(), partialProgressMaxCommits, RoundingMode.CEILING); + + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + "Rewrite plan created {}", + tableName, + taskName, + taskIndex, + ctx.timestamp(), + groups); + + for (RewriteFileGroup group : groups) { + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + "Emitting {}", + tableName, + taskName, + taskIndex, + ctx.timestamp(), + group); + out.collect(new PlannedGroup(table, groupsPerCommit, group)); + } + } catch (Exception e) { + LOG.warn( + DataFileRewritePlanner.MESSAGE_PREFIX + "Failed to plan data file rewrite groups", + tableName, + taskName, + taskIndex, + ctx.timestamp(), + e); + ctx.output(TaskResultAggregator.ERROR_STREAM, e); + errorCounter.inc(); + } + } + + @Override + public void close() throws Exception { + super.close(); + tableLoader.close(); + } + + public static class PlannedGroup { + private final SerializableTable table; + private final int groupsPerCommit; + private final RewriteFileGroup group; + + private PlannedGroup(SerializableTable table, int groupsPerCommit, RewriteFileGroup group) { + this.table = table; + this.groupsPerCommit = groupsPerCommit; + this.group = group; + } + + SerializableTable table() { + return table; + } + + int groupsPerCommit() { + return groupsPerCommit; + } + + RewriteFileGroup group() { + return group; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java new file mode 100644 index 000000000000..ad3b0454008c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; + +import java.util.Collections; +import java.util.Set; +import org.apache.flink.annotation.Internal; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.metrics.Counter; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.table.data.RowData; +import org.apache.flink.util.Collector; +import org.apache.iceberg.BaseCombinedScanTask; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.actions.RewriteFileGroup; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.maintenance.operator.DataFileRewritePlanner.PlannedGroup; +import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; +import org.apache.iceberg.flink.sink.TaskWriterFactory; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.FileScanTaskReader; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.util.PropertyUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Executes a rewrite for a single {@link PlannedGroup}. Reads the files with the standard {@link + * FileScanTaskReader}, so the delete files are considered, and writes using the {@link + * TaskWriterFactory}. The output is an {@link ExecutedGroup}. 
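+ *
+ * <p>For illustration only (the operator name and wiring are assumptions, not defined by this
+ * class), the runner sits between the planner and the committer, e.g.:
+ *
+ * <pre>{@code
+ * plannedGroups
+ *     .process(new DataFileRewriteRunner("db.table", "RewriteDataFiles", 0))
+ *     .name("Rewrite runner");
+ * }</pre>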
+ */ +@Internal +public class DataFileRewriteRunner + extends ProcessFunction { + private static final Logger LOG = LoggerFactory.getLogger(DataFileRewriteRunner.class); + + private final String tableName; + private final String taskName; + private final int taskIndex; + + private transient int subTaskId; + private transient int attemptId; + private transient Counter errorCounter; + + public DataFileRewriteRunner(String tableName, String taskName, int taskIndex) { + Preconditions.checkNotNull(tableName, "Table name should no be null"); + Preconditions.checkNotNull(taskName, "Task name should no be null"); + this.tableName = tableName; + this.taskName = taskName; + this.taskIndex = taskIndex; + } + + @Override + public void open(OpenContext context) { + this.errorCounter = + TableMaintenanceMetrics.groupFor(getRuntimeContext(), tableName, taskName, taskIndex) + .counter(TableMaintenanceMetrics.ERROR_COUNTER); + + this.subTaskId = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); + this.attemptId = getRuntimeContext().getTaskInfo().getAttemptNumber(); + } + + @Override + public void processElement(PlannedGroup value, Context ctx, Collector out) + throws Exception { + if (LOG.isDebugEnabled()) { + LOG.debug( + DataFileRewritePlanner.MESSAGE_PREFIX + "Rewriting files for group {} with files: {}", + tableName, + taskName, + taskIndex, + ctx.timestamp(), + value.group().info(), + value.group().rewrittenFiles()); + } else { + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + + "Rewriting files for group {} with {} number of files", + tableName, + taskName, + taskIndex, + ctx.timestamp(), + value.group().info(), + value.group().rewrittenFiles().size()); + } + + try (TaskWriter writer = writerFor(value)) { + try (DataIterator iterator = readerFor(value)) { + while (iterator.hasNext()) { + writer.write(iterator.next()); + } + + Set dataFiles = Sets.newHashSet(writer.dataFiles()); + value.group().setOutputFiles(dataFiles); + out.collect( + new ExecutedGroup( + value.table().currentSnapshot().snapshotId(), + value.groupsPerCommit(), + value.group())); + if (LOG.isDebugEnabled()) { + LOG.debug( + DataFileRewritePlanner.MESSAGE_PREFIX + "Rewritten files {} from {} to {}", + tableName, + taskName, + taskIndex, + ctx.timestamp(), + value.group().info(), + value.group().rewrittenFiles(), + value.group().addedFiles()); + } else { + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + "Rewritten {} files to {} files", + tableName, + taskName, + taskIndex, + ctx.timestamp(), + value.group().rewrittenFiles().size(), + value.group().addedFiles().size()); + } + } catch (Exception ex) { + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + "Exception rewriting datafile group {}", + tableName, + taskName, + taskIndex, + ctx.timestamp(), + value.group(), + ex); + ctx.output(TaskResultAggregator.ERROR_STREAM, ex); + errorCounter.inc(); + abort(writer, ctx.timestamp()); + } + } catch (Exception ex) { + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + + "Exception creating compaction writer for group {}", + tableName, + taskName, + taskIndex, + ctx.timestamp(), + value.group(), + ex); + ctx.output(TaskResultAggregator.ERROR_STREAM, ex); + errorCounter.inc(); + } + } + + private TaskWriter writerFor(PlannedGroup value) { + String formatString = + PropertyUtil.propertyAsString( + value.table().properties(), + TableProperties.DEFAULT_FILE_FORMAT, + TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + RowDataTaskWriterFactory factory = + new RowDataTaskWriterFactory( + value.table(), + 
FlinkSchemaUtil.convert(value.table().schema()), + value.group().inputSplitSize(), + FileFormat.fromString(formatString), + value.table().properties(), + null, + false); + factory.initialize(subTaskId, attemptId); + return factory.create(); + } + + private DataIterator readerFor(PlannedGroup value) { + RowDataFileScanTaskReader reader = + new RowDataFileScanTaskReader( + value.table().schema(), + value.table().schema(), + PropertyUtil.propertyAsString(value.table().properties(), DEFAULT_NAME_MAPPING, null), + false, + Collections.emptyList()); + return new DataIterator<>( + reader, + new BaseCombinedScanTask(value.group().fileScanTasks()), + value.table().io(), + value.table().encryption()); + } + + private void abort(TaskWriter writer, long timestamp) { + try { + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + + "Aborting rewrite for (subTaskId {}, attemptId {})", + tableName, + taskName, + taskIndex, + timestamp, + subTaskId, + attemptId); + writer.abort(); + } catch (Exception e) { + LOG.info( + DataFileRewritePlanner.MESSAGE_PREFIX + "Exception in abort", + tableName, + taskName, + taskIndex, + timestamp, + e); + } + } + + public static class ExecutedGroup { + private final long snapshotId; + private final int groupsPerCommit; + private final RewriteFileGroup group; + + @VisibleForTesting + ExecutedGroup(long snapshotId, int groupsPerCommit, RewriteFileGroup group) { + this.snapshotId = snapshotId; + this.groupsPerCommit = groupsPerCommit; + this.group = group; + } + + long snapshotId() { + return snapshotId; + } + + int groupsPerCommit() { + return groupsPerCommit; + } + + RewriteFileGroup group() { + return group; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java new file mode 100644 index 000000000000..9189f5f018a8 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.Set; +import org.apache.flink.annotation.Internal; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.BulkDeletionFailureException; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.SupportsBulkOperations; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Delete the files using the {@link FileIO} which implements {@link SupportsBulkOperations}. */ +@Internal +public class DeleteFilesProcessor extends AbstractStreamOperator + implements OneInputStreamOperator { + private static final Logger LOG = LoggerFactory.getLogger(DeleteFilesProcessor.class); + + private final String tableName; + private final String taskName; + private final int taskIndex; + private final SupportsBulkOperations io; + private final Set filesToDelete = Sets.newHashSet(); + private final int batchSize; + + private transient Counter failedCounter; + private transient Counter succeededCounter; + + public DeleteFilesProcessor(Table table, String taskName, int taskIndex, int batchSize) { + Preconditions.checkNotNull(taskName, "Task name should no be null"); + Preconditions.checkNotNull(table, "Table should no be null"); + + FileIO fileIO = table.io(); + Preconditions.checkArgument( + fileIO instanceof SupportsBulkOperations, + "%s doesn't support bulk delete", + fileIO.getClass().getSimpleName()); + + this.tableName = table.name(); + this.taskName = taskName; + this.taskIndex = taskIndex; + this.io = (SupportsBulkOperations) fileIO; + this.batchSize = batchSize; + } + + @Override + public void open() throws Exception { + MetricGroup taskMetricGroup = + TableMaintenanceMetrics.groupFor(getRuntimeContext(), tableName, taskName, taskIndex); + this.failedCounter = + taskMetricGroup.counter(TableMaintenanceMetrics.DELETE_FILE_FAILED_COUNTER); + this.succeededCounter = + taskMetricGroup.counter(TableMaintenanceMetrics.DELETE_FILE_SUCCEEDED_COUNTER); + } + + @Override + public void processElement(StreamRecord element) throws Exception { + if (element.isRecord()) { + filesToDelete.add(element.getValue()); + } + + if (filesToDelete.size() >= batchSize) { + deleteFiles(); + } + } + + @Override + public void processWatermark(Watermark mark) { + deleteFiles(); + } + + @Override + public void prepareSnapshotPreBarrier(long checkpointId) { + deleteFiles(); + } + + private void deleteFiles() { + try { + io.deleteFiles(filesToDelete); + LOG.info( + "Deleted {} files from table {} using bulk deletes", filesToDelete.size(), tableName); + succeededCounter.inc(filesToDelete.size()); + filesToDelete.clear(); + } catch (BulkDeletionFailureException e) { + int deletedFilesCount = filesToDelete.size() - e.numberFailedObjects(); + LOG.warn( + "Deleted only {} of {} files from table {} using bulk deletes", + deletedFilesCount, + filesToDelete.size(), + tableName, + e); + succeededCounter.inc(deletedFilesCount); + failedCounter.inc(e.numberFailedObjects()); + } + } +} diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java new file mode 100644 index 000000000000..2db9585ebd8a --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.Collections; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.util.Collector; +import org.apache.flink.util.OutputTag; +import org.apache.iceberg.ExpireSnapshots; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.api.TaskResult; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Calls the {@link ExpireSnapshots} to remove the old snapshots and emits the filenames which could + * be removed in the {@link #DELETE_STREAM} side output. 
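+ *
+ * <p>A wiring sketch (the stream and variable names are illustrative, not part of this class):
+ * the file names scheduled for deletion are read from the side output of the resulting operator.
+ *
+ * <pre>{@code
+ * SingleOutputStreamOperator<TaskResult> results =
+ *     triggers.process(
+ *         new ExpireSnapshotsProcessor(tableLoader, maxSnapshotAgeMs, retainLast, poolSize, false));
+ * DataStream<String> filesToDelete = results.getSideOutput(ExpireSnapshotsProcessor.DELETE_STREAM);
+ * }</pre>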
+ */ +@Internal +public class ExpireSnapshotsProcessor extends ProcessFunction { + private static final Logger LOG = LoggerFactory.getLogger(ExpireSnapshotsProcessor.class); + public static final OutputTag DELETE_STREAM = + new OutputTag<>("expire-snapshots-file-deletes-stream", Types.STRING); + + private final TableLoader tableLoader; + private final Long maxSnapshotAgeMs; + private final Integer numSnapshots; + private final Integer plannerPoolSize; + private final Boolean cleanExpiredMetadata; + private transient ExecutorService plannerPool; + private transient Table table; + + public ExpireSnapshotsProcessor( + TableLoader tableLoader, + Long maxSnapshotAgeMs, + Integer numSnapshots, + Integer plannerPoolSize, + Boolean cleanExpiredMetadata) { + Preconditions.checkNotNull(tableLoader, "Table loader should not be null"); + + this.tableLoader = tableLoader; + this.maxSnapshotAgeMs = maxSnapshotAgeMs; + this.numSnapshots = numSnapshots; + this.plannerPoolSize = plannerPoolSize; + this.cleanExpiredMetadata = cleanExpiredMetadata; + } + + @Override + public void open(OpenContext parameters) throws Exception { + tableLoader.open(); + this.table = tableLoader.loadTable(); + this.plannerPool = + plannerPoolSize != null + ? ThreadPools.newFixedThreadPool(table.name() + "-table--planner", plannerPoolSize) + : ThreadPools.getWorkerPool(); + } + + @Override + public void processElement(Trigger trigger, Context ctx, Collector out) + throws Exception { + try { + table.refresh(); + ExpireSnapshots expireSnapshots = table.expireSnapshots(); + if (maxSnapshotAgeMs != null) { + expireSnapshots = expireSnapshots.expireOlderThan(ctx.timestamp() - maxSnapshotAgeMs); + } + + if (numSnapshots != null) { + expireSnapshots = expireSnapshots.retainLast(numSnapshots); + } + + if (cleanExpiredMetadata != null) { + expireSnapshots.cleanExpiredMetadata(cleanExpiredMetadata); + } + + AtomicLong deleteFileCounter = new AtomicLong(0L); + expireSnapshots + .planWith(plannerPool) + .deleteWith( + file -> { + ctx.output(DELETE_STREAM, file); + deleteFileCounter.incrementAndGet(); + }) + .cleanExpiredFiles(true) + .commit(); + + LOG.info( + "Successfully finished expiring snapshots for {} at {}. Scheduled {} files for delete.", + table, + ctx.timestamp(), + deleteFileCounter.get()); + out.collect( + new TaskResult(trigger.taskId(), trigger.timestamp(), true, Collections.emptyList())); + } catch (Exception e) { + LOG.error("Failed to expiring snapshots for {} at {}", table, ctx.timestamp(), e); + out.collect( + new TaskResult(trigger.taskId(), trigger.timestamp(), false, Lists.newArrayList(e))); + } + } + + @Override + public void close() throws Exception { + super.close(); + + tableLoader.close(); + if (plannerPoolSize != null) { + plannerPool.shutdown(); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java new file mode 100644 index 000000000000..98610346aa18 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.RowData; +import org.apache.flink.util.Collector; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.source.ScanContext; + +/** A specialized reader implementation that extracts file names from Iceberg table rows. */ +@Internal +public class FileNameReader extends TableReader { + + public FileNameReader( + String taskName, + int taskIndex, + TableLoader tableLoader, + Schema projectedSchema, + ScanContext scanContext, + MetadataTableType metadataTableType) { + super(taskName, taskIndex, tableLoader, projectedSchema, scanContext, metadataTableType); + } + + @Override + void extract(RowData rowData, Collector out) { + if (rowData != null && rowData.getString(0) != null) { + out.collect(rowData.getString(0).toString()); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java new file mode 100644 index 000000000000..0ccf6a6ff08a --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.Map; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.actions.FileURI; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A key selector implementation that extracts a normalized file path from a file URI string. + * + *
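+ * <p>An illustrative use (the {@code files}, {@code equalSchemes} and {@code equalAuthorities}
+ * variables are placeholders) is to key a stream of file URIs by their normalized path before
+ * comparing table files with file system files:
+ *
+ * <pre>{@code
+ * KeyedStream<String, String> keyed =
+ *     files.keyBy(new FileUriKeySelector(equalSchemes, equalAuthorities));
+ * }</pre>
+ *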

This selector groups file URIs by their normalized path, ignoring differences in scheme and + * authority that are considered equivalent according to the provided mappings. + */ +@Internal +public class FileUriKeySelector implements KeySelector { + private static final Logger LOG = LoggerFactory.getLogger(FileUriKeySelector.class); + + static final String INVALID_URI = "__INVALID_URI__"; + + private final Map equalSchemes; + private final Map equalAuthorities; + + public FileUriKeySelector( + Map equalSchemes, Map equalAuthorities) { + this.equalSchemes = equalSchemes; + this.equalAuthorities = equalAuthorities; + } + + @Override + public String getKey(String value) throws Exception { + try { + FileURI fileUri = new FileURI(new Path(value).toUri(), equalSchemes, equalAuthorities); + return fileUri.getPath(); + } catch (Exception e) { + LOG.warn("Uri convert to FileURI error! Uri is {}.", value, e); + return INVALID_URI; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java new file mode 100644 index 000000000000..1db95be8d3b6 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.Map; +import java.util.function.Predicate; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.metrics.Counter; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.util.Collector; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.FileInfo; +import org.apache.iceberg.io.SupportsPrefixOperations; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.util.FileSystemWalker; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Recursively lists the files in the `location` directory. Hidden files, and files younger than the + * `minAgeMs` are omitted in the result. 
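+ *
+ * <p>Illustrative construction only (the task name and age threshold are example values): listing
+ * the table location, via Hadoop file listing, for files older than one day:
+ *
+ * <pre>{@code
+ * new ListFileSystemFiles("DeleteOrphanFiles", 0, tableLoader, null, Duration.ofDays(1).toMillis(), false)
+ * }</pre>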
+ */ +@Internal +public class ListFileSystemFiles extends ProcessFunction { + private static final Logger LOG = LoggerFactory.getLogger(ListFileSystemFiles.class); + + private final String taskName; + private final int taskIndex; + + private FileIO io; + private Map specs; + private String location; + private final long minAgeMs; + private transient Counter errorCounter; + private final TableLoader tableLoader; + private final boolean usePrefixListing; + private transient Configuration configuration; + + public ListFileSystemFiles( + String taskName, + int taskIndex, + TableLoader tableLoader, + String location, + long minAgeMs, + boolean usePrefixListing) { + Preconditions.checkNotNull(taskName, "Task name should no be null"); + Preconditions.checkNotNull(tableLoader, "TableLoad should no be null"); + + this.tableLoader = tableLoader; + this.taskName = taskName; + this.taskIndex = taskIndex; + this.minAgeMs = minAgeMs; + this.location = location; + this.usePrefixListing = usePrefixListing; + } + + @Override + public void open(OpenContext openContext) throws Exception { + super.open(openContext); + tableLoader.open(); + Table table = tableLoader.loadTable(); + this.io = table.io(); + this.location = location != null ? location : table.location(); + this.specs = table.specs(); + this.errorCounter = + TableMaintenanceMetrics.groupFor(getRuntimeContext(), table.name(), taskName, taskIndex) + .counter(TableMaintenanceMetrics.ERROR_COUNTER); + this.configuration = new Configuration(); + table.properties().forEach(configuration::set); + } + + @Override + public void processElement(Trigger trigger, Context ctx, Collector out) throws Exception { + long olderThanTimestamp = trigger.timestamp() - minAgeMs; + try { + if (usePrefixListing) { + Predicate predicate = fileInfo -> fileInfo.createdAtMillis() < olderThanTimestamp; + Preconditions.checkArgument( + io instanceof SupportsPrefixOperations, + "Cannot use prefix listing with FileIO {} which does not support prefix operations.", + io); + + FileSystemWalker.listDirRecursivelyWithFileIO( + (SupportsPrefixOperations) io, location, specs, predicate, out::collect); + } else { + Predicate predicate = file -> file.getModificationTime() < olderThanTimestamp; + FileSystemWalker.listDirRecursivelyWithHadoop( + location, + specs, + predicate, + configuration, + Integer.MAX_VALUE, + Integer.MAX_VALUE, + dir -> {}, + out::collect); + } + } catch (Exception e) { + LOG.warn("Exception listing files for {} at {}", location, ctx.timestamp(), e); + ctx.output(DeleteOrphanFiles.ERROR_STREAM, e); + errorCounter.inc(); + } + } + + @Override + public void close() throws Exception { + super.close(); + tableLoader.close(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java new file mode 100644 index 000000000000..3ae42c60831c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
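 // Illustrative sketch, not part of this patch: the age cut-off that ListFileSystemFiles applies, // shown against a plain FileIO prefix listing instead of FileSystemWalker (the operator also // derives the cut-off from the Trigger timestamp rather than the wall clock). The `io`, // `location` and `minAgeMs` arguments are assumed to come from the job configuration. import java.util.function.Predicate; import org.apache.iceberg.io.FileInfo; import org.apache.iceberg.io.SupportsPrefixOperations; class ListOldFilesSketch { static void printOldFiles(SupportsPrefixOperations io, String location, long minAgeMs) { long olderThanTimestamp = System.currentTimeMillis() - minAgeMs; Predicate<FileInfo> oldEnough = fileInfo -> fileInfo.createdAtMillis() < olderThanTimestamp; for (FileInfo fileInfo : io.listPrefix(location)) { if (oldEnough.test(fileInfo)) { // Files older than the cut-off become candidates for the orphan-file anti-join. System.out.println(fileInfo.location()); } } } } 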
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.metrics.Counter; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.util.Collector; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ReachableFileUtil; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Lists the metadata files referenced by the table. */ +@Internal +public class ListMetadataFiles extends ProcessFunction { + private static final Logger LOG = LoggerFactory.getLogger(ListMetadataFiles.class); + + private final String taskName; + private final int taskIndex; + private transient Counter errorCounter; + private final TableLoader tableLoader; + private transient Table table; + + public ListMetadataFiles(String taskName, int taskIndex, TableLoader tableLoader) { + Preconditions.checkNotNull(taskName, "Task name should no be null"); + Preconditions.checkNotNull(tableLoader, "TableLoader should no be null"); + this.tableLoader = tableLoader; + this.taskName = taskName; + this.taskIndex = taskIndex; + } + + @Override + public void open(OpenContext openContext) throws Exception { + super.open(openContext); + tableLoader.open(); + this.table = tableLoader.loadTable(); + this.errorCounter = + TableMaintenanceMetrics.groupFor(getRuntimeContext(), table.name(), taskName, taskIndex) + .counter(TableMaintenanceMetrics.ERROR_COUNTER); + } + + @Override + public void processElement(Trigger trigger, Context ctx, Collector collector) + throws Exception { + try { + table + .snapshots() + .forEach( + snapshot -> { + // Manifest lists + collector.collect(snapshot.manifestListLocation()); + // Snapshot JSONs + ReachableFileUtil.metadataFileLocations(table, false).forEach(collector::collect); + // Statistics files + ReachableFileUtil.statisticsFilesLocations(table).forEach(collector::collect); + // Version hint file for Hadoop catalogs + collector.collect(ReachableFileUtil.versionHintLocation(table)); + + // Emit the manifest file locations + snapshot.allManifests(table.io()).stream() + .map(ManifestFile::path) + .forEach(collector::collect); + }); + } catch (Exception e) { + LOG.error("Exception listing metadata files for {} at {}", table, ctx.timestamp(), e); + ctx.output(DeleteOrphanFiles.ERROR_STREAM, e); + errorCounter.inc(); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java new file mode 100644 index 000000000000..ea91f13376a5 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java @@ -0,0 +1,87 @@ 
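 // Illustrative sketch, not part of this patch: the categories of locations ListMetadataFiles // emits, collected into a list instead of a Flink Collector. The `table` argument is assumed to // be loaded through a TableLoader as in the operator above. import java.util.List; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.ReachableFileUtil; import org.apache.iceberg.Table; import org.apache.iceberg.relocated.com.google.common.collect.Lists; class MetadataFileLocationsSketch { static List<String> metadataLocations(Table table) { List<String> locations = Lists.newArrayList(); // Metadata JSONs, statistics files and the version hint file used by Hadoop catalogs. locations.addAll(ReachableFileUtil.metadataFileLocations(table, false)); locations.addAll(ReachableFileUtil.statisticsFilesLocations(table)); locations.add(ReachableFileUtil.versionHintLocation(table)); // Manifest lists and manifests for every snapshot still referenced by the table. table.snapshots().forEach(snapshot -> { locations.add(snapshot.manifestListLocation()); snapshot.allManifests(table.io()).stream().map(ManifestFile::path).forEach(locations::add); }); return locations; } } 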
+/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.Map; +import org.apache.commons.lang3.StringUtils; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.flink.maintenance.api.JdbcLockFactory; +import org.apache.iceberg.flink.maintenance.api.LockConfig; +import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; +import org.apache.iceberg.flink.maintenance.api.ZkLockFactory; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +@Internal +public class LockFactoryBuilder { + + private LockFactoryBuilder() {} + + public static TriggerLockFactory build(LockConfig lockConfig, String tableName) { + + String lockType = lockConfig.lockType(); + + Preconditions.checkArgument( + StringUtils.isNotEmpty(lockType), + "Configuration must contain key: %s", + LockConfig.LOCK_TYPE_OPTION.key()); + + // Set lock id to catalog.db.table if not set + switch (lockType) { + case LockConfig.JdbcLockConfig.JDBC: + return createJdbcLockFactory(lockConfig, tableName); + + case LockConfig.ZkLockConfig.ZK: + return createZkLockFactory(lockConfig, tableName); + + default: + throw new IllegalArgumentException(String.format("Unsupported lock type: %s ", lockType)); + } + } + + private static TriggerLockFactory createJdbcLockFactory(LockConfig lockConfig, String tableName) { + String jdbcUri = lockConfig.jdbcUri(); + String lockId = lockConfig.lockId(tableName); + Map properties = lockConfig.properties(); + Preconditions.checkArgument( + StringUtils.isNotEmpty(jdbcUri), + "JDBC lock requires %s parameter", + LockConfig.JdbcLockConfig.JDBC_URI_OPTION.key()); + + properties.put(JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY, lockConfig.jdbcInitTable()); + + return new JdbcLockFactory(jdbcUri, lockId, properties); + } + + private static TriggerLockFactory createZkLockFactory(LockConfig lockConfig, String tableName) { + String zkUri = lockConfig.zkUri(); + String lockId = lockConfig.lockId(tableName); + Preconditions.checkArgument( + StringUtils.isNotEmpty(zkUri), + "Zk lock requires %s parameter", + LockConfig.ZkLockConfig.ZK_URI_OPTION.key()); + + return new ZkLockFactory( + zkUri, + lockId, + lockConfig.zkSessionTimeoutMs(), + lockConfig.zkConnectionTimeoutMs(), + lockConfig.zkBaseSleepMs(), + lockConfig.zkMaxRetries()); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java new file mode 100644 index 000000000000..2066ca8e010e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.List; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.iceberg.flink.maintenance.api.TaskResult; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Manages locks and collect {@link org.apache.flink.metrics.Metric} for the Maintenance Tasks. + * + *

 The assumptions about the locks are the following: + * + *

        + *
      • Every {@link TaskResult} is followed by a {@link Watermark} for normal {@link Trigger}s + *
      • For the {@link Trigger#recovery(long)} {@link Watermark} there is no element to process + *
      + * + *

      When processing the inputs there are 3 possibilities: + * + *

        + *
      • Normal execution - we receive a {@link TaskResult} and then a {@link Watermark} - unlocking + * the lock is handled by the {@link #processElement(StreamRecord)} + *
      • Recovery without ongoing execution (unlocking the recoveryLock) - we receive the {@link + * Trigger#recovery(long)} {@link Watermark} without any {@link TaskResult} - unlocking the + * {@link TriggerLockFactory#createRecoveryLock()} and a possible {@link + * TriggerLockFactory#createLock()} is handled by the {@link #processWatermark(Watermark)} + * (the {@link #lastProcessedTaskStartEpoch} is 0 in this case) + *
      • Recovery with an ongoing execution - we receive a {@link TaskResult} and then a {@link + * Watermark} - unlocking the {@link TriggerLockFactory#createLock()} is handled by the {@link + * #processElement(StreamRecord)}, unlocking the {@link + * TriggerLockFactory#createRecoveryLock()} is handled by the {@link + * #processWatermark(Watermark)} (the {@link #lastProcessedTaskStartEpoch} is the start time + * of the old task) 
+ */ +@Internal +public class LockRemover extends AbstractStreamOperator + implements OneInputStreamOperator { + private static final Logger LOG = LoggerFactory.getLogger(LockRemover.class); + + private final String tableName; + private final TriggerLockFactory lockFactory; + private final List maintenanceTaskNames; + + private transient List succeededTaskResultCounters; + private transient List failedTaskResultCounters; + private transient List taskLastRunDurationMs; + private transient TriggerLockFactory.Lock lock; + private transient TriggerLockFactory.Lock recoveryLock; + private transient long lastProcessedTaskStartEpoch = 0L; + + public LockRemover( + String tableName, TriggerLockFactory lockFactory, List maintenanceTaskNames) { + Preconditions.checkNotNull(lockFactory, "Lock factory should no be null"); + Preconditions.checkArgument( + maintenanceTaskNames != null && !maintenanceTaskNames.isEmpty(), + "Invalid maintenance task names: null or empty"); + + this.tableName = tableName; + this.lockFactory = lockFactory; + this.maintenanceTaskNames = maintenanceTaskNames; + } + + @Override + public void open() throws Exception { + super.open(); + this.succeededTaskResultCounters = + Lists.newArrayListWithExpectedSize(maintenanceTaskNames.size()); + this.failedTaskResultCounters = Lists.newArrayListWithExpectedSize(maintenanceTaskNames.size()); + this.taskLastRunDurationMs = Lists.newArrayListWithExpectedSize(maintenanceTaskNames.size()); + for (int taskIndex = 0; taskIndex < maintenanceTaskNames.size(); ++taskIndex) { + MetricGroup taskMetricGroup = + TableMaintenanceMetrics.groupFor( + getRuntimeContext(), tableName, maintenanceTaskNames.get(taskIndex), taskIndex); + succeededTaskResultCounters.add( + taskMetricGroup.counter(TableMaintenanceMetrics.SUCCEEDED_TASK_COUNTER)); + failedTaskResultCounters.add( + taskMetricGroup.counter(TableMaintenanceMetrics.FAILED_TASK_COUNTER)); + AtomicLong duration = new AtomicLong(0); + taskLastRunDurationMs.add(duration); + taskMetricGroup.gauge(TableMaintenanceMetrics.LAST_RUN_DURATION_MS, duration::get); + } + + lockFactory.open(); + this.lock = lockFactory.createLock(); + this.recoveryLock = lockFactory.createRecoveryLock(); + } + + @Override + public void processElement(StreamRecord streamRecord) { + TaskResult taskResult = streamRecord.getValue(); + LOG.info( + "Processing result {} for task {}", + taskResult, + maintenanceTaskNames.get(taskResult.taskIndex())); + long duration = System.currentTimeMillis() - taskResult.startEpoch(); + lock.unlock(); + this.lastProcessedTaskStartEpoch = taskResult.startEpoch(); + + // Update the metrics + taskLastRunDurationMs.get(taskResult.taskIndex()).set(duration); + if (taskResult.success()) { + succeededTaskResultCounters.get(taskResult.taskIndex()).inc(); + } else { + failedTaskResultCounters.get(taskResult.taskIndex()).inc(); + } + } + + @Override + public void processWatermark(Watermark mark) { + if (mark.getTimestamp() > lastProcessedTaskStartEpoch) { + lock.unlock(); + recoveryLock.unlock(); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java new file mode 100644 index 000000000000..8bdcd7ba2b57 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +class LogUtil { + static final String MESSAGE_PREFIX = "[For table {} with {}[{}] at {}]: "; + static final String MESSAGE_FORMAT_PREFIX = "[For table %s with {%s}[{%d}] at {%d}]: "; + + private LogUtil() {} +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java new file mode 100644 index 000000000000..c03b3be1a977 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.concurrent.ExecutorService; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.metrics.Counter; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.util.Collector; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.MetadataTableUtils; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.flink.source.FlinkSplitPlanner; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Plans the splits to read a metadata table content. 
*/ +@Internal +public class MetadataTablePlanner extends ProcessFunction { + private static final Logger LOG = LoggerFactory.getLogger(MetadataTablePlanner.class); + + private final String taskName; + private final int taskIndex; + private final TableLoader tableLoader; + private final int workerPoolSize; + private final ScanContext scanContext; + private transient ExecutorService workerPool; + private transient Counter errorCounter; + private transient Table table; + private transient IcebergSourceSplitSerializer splitSerializer; + private final MetadataTableType metadataTableType; + + public MetadataTablePlanner( + String taskName, + int taskIndex, + TableLoader tableLoader, + ScanContext scanContext, + MetadataTableType metadataTableType, + int workerPoolSize) { + Preconditions.checkNotNull(taskName, "Task name should no be null"); + Preconditions.checkNotNull(tableLoader, "Table should no be null"); + Preconditions.checkArgument(scanContext.isStreaming(), "Streaming should be set to true"); + + this.taskName = taskName; + this.taskIndex = taskIndex; + this.tableLoader = tableLoader; + this.scanContext = scanContext; + this.workerPoolSize = workerPoolSize; + this.metadataTableType = metadataTableType; + } + + @Override + public void open(OpenContext openContext) throws Exception { + tableLoader.open(); + Table originalTable = tableLoader.loadTable(); + this.table = MetadataTableUtils.createMetadataTableInstance(originalTable, metadataTableType); + this.workerPool = + ThreadPools.newFixedThreadPool(table.name() + "-table-planner", workerPoolSize); + this.splitSerializer = new IcebergSourceSplitSerializer(scanContext.caseSensitive()); + this.errorCounter = + TableMaintenanceMetrics.groupFor( + getRuntimeContext(), originalTable.name(), taskName, taskIndex) + .counter(TableMaintenanceMetrics.ERROR_COUNTER); + } + + @Override + public void processElement(Trigger trigger, Context ctx, Collector out) + throws Exception { + try { + table.refresh(); + for (IcebergSourceSplit split : + FlinkSplitPlanner.planIcebergSourceSplits(table, scanContext, workerPool)) { + out.collect(new SplitInfo(splitSerializer.getVersion(), splitSerializer.serialize(split))); + } + } catch (Exception e) { + LOG.warn("Exception planning scan for {} at {}", table, ctx.timestamp(), e); + ctx.output(DeleteOrphanFiles.ERROR_STREAM, e); + errorCounter.inc(); + } + } + + @Override + public void close() throws Exception { + super.close(); + tableLoader.close(); + if (workerPool != null) { + workerPool.shutdown(); + } + } + + public static class SplitInfo { + private final int version; + private final byte[] split; + + public SplitInfo(int version, byte[] split) { + this.version = version; + this.split = split; + } + + public int version() { + return version; + } + + public byte[] split() { + return split; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java new file mode 100644 index 000000000000..d74b2349b1de --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
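 // Illustrative sketch, not part of this patch: turning a SplitInfo emitted by // MetadataTablePlanner back into an IcebergSourceSplit downstream. The case sensitivity flag is // assumed to match the planner's ScanContext. import java.io.IOException; import org.apache.iceberg.flink.maintenance.operator.MetadataTablePlanner; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; class SplitInfoRoundTripSketch { static IcebergSourceSplit toSplit(MetadataTablePlanner.SplitInfo splitInfo) throws IOException { IcebergSourceSplitSerializer serializer = new IcebergSourceSplitSerializer(true); // The version travels with the serialized bytes, so readers can still deserialize splits // written by an older serializer format. return serializer.deserialize(splitInfo.version(), splitInfo.split()); } } 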
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.connector.source.util.ratelimit.RateLimitedSourceReader; +import org.apache.flink.api.connector.source.util.ratelimit.RateLimiter; +import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.iceberg.DataOperations; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Monitors an Iceberg table for changes */ +@Internal +public class MonitorSource extends SingleThreadedIteratorSource { + private static final Logger LOG = LoggerFactory.getLogger(MonitorSource.class); + + private final TableLoader tableLoader; + private final RateLimiterStrategy rateLimiterStrategy; + private final long maxReadBack; + + /** + * Creates a {@link org.apache.flink.api.connector.source.Source} which monitors an Iceberg table + * for changes. 
+ * + * @param tableLoader used for accessing the table + * @param rateLimiterStrategy limits the frequency the table is checked + * @param maxReadBack sets the number of snapshots read before stopping change collection + */ + public MonitorSource( + TableLoader tableLoader, RateLimiterStrategy rateLimiterStrategy, long maxReadBack) { + Preconditions.checkNotNull(tableLoader, "Table loader should no be null"); + Preconditions.checkNotNull(rateLimiterStrategy, "Rate limiter strategy should no be null"); + Preconditions.checkArgument(maxReadBack > 0, "Need to read at least 1 snapshot to work"); + + this.tableLoader = tableLoader; + this.rateLimiterStrategy = rateLimiterStrategy; + this.maxReadBack = maxReadBack; + } + + @Override + public Boundedness getBoundedness() { + return Boundedness.CONTINUOUS_UNBOUNDED; + } + + @Override + public TypeInformation getProducedType() { + return TypeInformation.of(TableChange.class); + } + + @Override + Iterator createIterator() { + return new TableChangeIterator(tableLoader, null, maxReadBack); + } + + @Override + SimpleVersionedSerializer> iteratorSerializer() { + return new TableChangeIteratorSerializer(tableLoader, maxReadBack); + } + + @Override + public SourceReader> createReader( + SourceReaderContext readerContext) throws Exception { + RateLimiter rateLimiter = rateLimiterStrategy.createRateLimiter(1); + return new RateLimitedSourceReader<>(super.createReader(readerContext), rateLimiter); + } + + /** The Iterator which returns the latest changes on an Iceberg table. */ + @VisibleForTesting + static class TableChangeIterator implements Iterator { + private Long lastSnapshotId; + private final long maxReadBack; + private final Table table; + + TableChangeIterator(TableLoader tableLoader, Long lastSnapshotId, long maxReadBack) { + this.lastSnapshotId = lastSnapshotId; + this.maxReadBack = maxReadBack; + tableLoader.open(); + this.table = tableLoader.loadTable(); + } + + @Override + public boolean hasNext() { + return true; + } + + @Override + public TableChange next() { + try { + table.refresh(); + Snapshot currentSnapshot = table.currentSnapshot(); + Long current = currentSnapshot != null ? 
currentSnapshot.snapshotId() : null; + Long checking = current; + TableChange event = TableChange.empty(); + long readBack = 0; + while (checking != null && !checking.equals(lastSnapshotId) && ++readBack <= maxReadBack) { + Snapshot snapshot = table.snapshot(checking); + if (snapshot != null) { + if (!DataOperations.REPLACE.equals(snapshot.operation())) { + LOG.debug("Reading snapshot {}", snapshot.snapshotId()); + event.merge(new TableChange(snapshot, table.io())); + } else { + LOG.debug("Skipping replace snapshot {}", snapshot.snapshotId()); + } + + checking = snapshot.parentId(); + } else { + // If the last snapshot has been removed from the history + checking = null; + } + } + + lastSnapshotId = current; + return event; + } catch (Exception e) { + LOG.warn("Failed to fetch table changes for {}", table, e); + return TableChange.empty(); + } + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("lastSnapshotId", lastSnapshotId) + .add("maxReadBack", maxReadBack) + .add("table", table) + .toString(); + } + } + + private static final class TableChangeIteratorSerializer + implements SimpleVersionedSerializer> { + + private static final int CURRENT_VERSION = 1; + private final TableLoader tableLoader; + private final long maxReadBack; + + TableChangeIteratorSerializer(TableLoader tableLoader, long maxReadBack) { + this.tableLoader = tableLoader; + this.maxReadBack = maxReadBack; + } + + @Override + public int getVersion() { + return CURRENT_VERSION; + } + + @Override + public byte[] serialize(Iterator iterator) throws IOException { + Preconditions.checkArgument( + iterator instanceof TableChangeIterator, + "Use TableChangeIterator iterator. Found incompatible type: %s", + iterator.getClass()); + + TableChangeIterator tableChangeIterator = (TableChangeIterator) iterator; + DataOutputSerializer out = new DataOutputSerializer(8); + long toStore = + tableChangeIterator.lastSnapshotId != null ? tableChangeIterator.lastSnapshotId : -1L; + out.writeLong(toStore); + return out.getCopyOfBuffer(); + } + + @Override + public TableChangeIterator deserialize(int version, byte[] serialized) throws IOException { + if (version == CURRENT_VERSION) { + DataInputDeserializer in = new DataInputDeserializer(serialized); + long fromStore = in.readLong(); + return new TableChangeIterator( + tableLoader, fromStore != -1 ? fromStore : null, maxReadBack); + } else { + throw new IOException("Unrecognized version or corrupt state: " + version); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java new file mode 100644 index 000000000000..5c602f4f1e54 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
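 // Illustrative sketch, not part of this patch: wiring MonitorSource into a job. The warehouse // path, rate limit and maxReadBack values are placeholders. import org.apache.flink.api.common.eventtime.WatermarkStrategy; import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.iceberg.flink.TableLoader; import org.apache.iceberg.flink.maintenance.operator.MonitorSource; import org.apache.iceberg.flink.maintenance.operator.TableChange; class MonitorSourceSketch { static DataStream<TableChange> tableChanges(StreamExecutionEnvironment env) { TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://namenode/warehouse/db/table"); MonitorSource source = // check the table at most every 10 seconds, read back at most 64 snapshots per check new MonitorSource(tableLoader, RateLimiterStrategy.perSecond(0.1), 64); return env.fromSource(source, WatermarkStrategy.noWatermarks(), "table-change-monitor") .setParallelism(1); // the source is single-threaded by design } } 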
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.commons.lang3.StringUtils; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.api.common.state.MapState; +import org.apache.flink.api.common.state.MapStateDescriptor; +import org.apache.flink.api.common.state.ValueState; +import org.apache.flink.api.common.state.ValueStateDescriptor; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.streaming.api.functions.co.KeyedCoProcessFunction; +import org.apache.flink.util.Collector; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.actions.DeleteOrphanFiles; +import org.apache.iceberg.actions.FileURI; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A specialized co-process function that performs an anti-join between two streams of file URIs. + * + *

Emits every file that exists in the file system but is not referenced in the table metadata, + * which are considered orphan files. It also handles URI normalization using provided scheme and + * authority equivalence mappings. + */ +@Internal +public class OrphanFilesDetector extends KeyedCoProcessFunction { + private static final Logger LOG = LoggerFactory.getLogger(OrphanFilesDetector.class); + + // Use MapState to dedupe the strings found in the table + private transient MapState foundInTable; + private transient ValueState foundInFileSystem; + private transient ValueState hasUriError; + private final DeleteOrphanFiles.PrefixMismatchMode prefixMismatchMode; + private final Map equalSchemes; + private final Map equalAuthorities; + + public OrphanFilesDetector( + DeleteOrphanFiles.PrefixMismatchMode prefixMismatchMode, + Map equalSchemes, + Map equalAuthorities) { + this.prefixMismatchMode = prefixMismatchMode; + this.equalSchemes = equalSchemes; + this.equalAuthorities = equalAuthorities; + } + + @Override + public void open(OpenContext openContext) throws Exception { + super.open(openContext); + foundInTable = + getRuntimeContext() + .getMapState( + new MapStateDescriptor<>("antiJoinFoundInTable", Types.STRING, Types.BOOLEAN)); + hasUriError = + getRuntimeContext().getState(new ValueStateDescriptor<>("antiJoinUriError", Types.BOOLEAN)); + foundInFileSystem = + getRuntimeContext() + .getState(new ValueStateDescriptor<>("antiJoinFoundInFileSystem", Types.STRING)); + } + + @Override + public void processElement1(String value, Context context, Collector collector) + throws Exception { + if (shouldSkipElement(value, context)) { + return; + } + + if (!foundInTable.contains(value)) { + foundInTable.put(value, true); + context.timerService().registerEventTimeTimer(context.timestamp()); + } + } + + @Override + public void processElement2(String value, Context context, Collector collector) + throws Exception { + if (shouldSkipElement(value, context)) { + return; + } + + foundInFileSystem.update(value); + context.timerService().registerEventTimeTimer(context.timestamp()); + } + + @Override + public void onTimer(long timestamp, OnTimerContext ctx, Collector out) throws Exception { + if (Boolean.TRUE.equals(hasUriError.value())) { + clearState(); + return; + } + + List foundInTablesList = Lists.newArrayList(); + foundInTable + .keys() + .forEach( + uri -> + foundInTablesList.add( + new FileURI(new Path(uri).toUri(), equalSchemes, equalAuthorities))); + + if (foundInFileSystem.value() != null) { + if (foundInTablesList.isEmpty()) { + FileURI fileURI = + new FileURI( + new Path(foundInFileSystem.value()).toUri(), equalSchemes, equalAuthorities); + out.collect(fileURI.getUriAsString()); + } else { + FileURI actual = + new FileURI( + new Path(foundInFileSystem.value()).toUri(), equalSchemes, equalAuthorities); + if (hasMismatch(actual, foundInTablesList)) { + if (prefixMismatchMode == DeleteOrphanFiles.PrefixMismatchMode.DELETE) { + out.collect(foundInFileSystem.value()); + } else if (prefixMismatchMode == DeleteOrphanFiles.PrefixMismatchMode.ERROR) { + ValidationException validationException = + new ValidationException( + "Unable to determine whether certain files are orphan. " + + "Metadata references files that match listed/provided files except for authority/scheme. " + + "Please, inspect the conflicting authorities/schemes and provide which of them are equal " + + "by further configuring the action via equalSchemes() and equalAuthorities() methods. 
" + + "Set the prefix mismatch mode to 'NONE' to ignore remaining locations with conflicting " + + "authorities/schemes or to 'DELETE' if you are ABSOLUTELY confident that remaining conflicting " + + "authorities/schemes are different. It will be impossible to recover deleted files. " + + "Conflicting authorities/schemes"); + LOG.warn( + "Unable to determine whether certain files are orphan. Found in filesystem: {} and in table: {}", + actual, + StringUtils.join(foundInTablesList, ","), + validationException); + ctx.output( + org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.ERROR_STREAM, + validationException); + } + } + } + } + + clearState(); + } + + private boolean hasMismatch(FileURI actual, List foundInTablesList) { + return foundInTablesList.stream() + .noneMatch(valid -> valid.schemeMatch(actual) && valid.authorityMatch(actual)); + } + + private boolean shouldSkipElement(String value, Context context) throws IOException { + if (Boolean.TRUE.equals(hasUriError.value())) { + return true; + } + + if (FileUriKeySelector.INVALID_URI.equals(context.getCurrentKey())) { + context.output( + org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.ERROR_STREAM, + new RuntimeException("Invalid URI format detected: " + value)); + hasUriError.update(true); + foundInTable.clear(); + foundInFileSystem.clear(); + return true; + } + + return false; + } + + private void clearState() { + hasUriError.clear(); + foundInTable.clear(); + foundInFileSystem.clear(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java new file mode 100644 index 000000000000..20c7684d9700 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.connector.source.lib.util.IteratorSourceEnumerator; +import org.apache.flink.api.connector.source.lib.util.IteratorSourceReader; +import org.apache.flink.api.connector.source.lib.util.IteratorSourceSplit; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** + * Implementation of the Source V2 API which uses an iterator to read the elements, and uses a + * single thread to do so. + * + * @param The return type of the source + */ +@Internal +public abstract class SingleThreadedIteratorSource + implements Source< + T, + SingleThreadedIteratorSource.GlobalSplit, + Collection>>, + ResultTypeQueryable { + private static final String PARALLELISM_ERROR = "Parallelism should be set to 1"; + + /** + * Creates the iterator to return the elements which then emitted by the source. + * + * @return iterator for the elements + */ + abstract Iterator createIterator(); + + /** + * Serializes the iterator, which is used to save and restore the state of the source. + * + * @return serializer for the iterator + */ + abstract SimpleVersionedSerializer> iteratorSerializer(); + + @Override + public SplitEnumerator, Collection>> createEnumerator( + SplitEnumeratorContext> enumContext) { + Preconditions.checkArgument(enumContext.currentParallelism() == 1, PARALLELISM_ERROR); + return new IteratorSourceEnumerator<>( + enumContext, ImmutableList.of(new GlobalSplit<>(createIterator()))); + } + + @Override + public SplitEnumerator, Collection>> restoreEnumerator( + SplitEnumeratorContext> enumContext, Collection> checkpoint) { + Preconditions.checkArgument(enumContext.currentParallelism() == 1, PARALLELISM_ERROR); + return new IteratorSourceEnumerator<>(enumContext, checkpoint); + } + + @Override + public SimpleVersionedSerializer> getSplitSerializer() { + return new SplitSerializer<>(iteratorSerializer()); + } + + @Override + public SimpleVersionedSerializer>> getEnumeratorCheckpointSerializer() { + return new EnumeratorSerializer<>(iteratorSerializer()); + } + + @Override + public SourceReader> createReader(SourceReaderContext readerContext) + throws Exception { + Preconditions.checkArgument(readerContext.getIndexOfSubtask() == 0, PARALLELISM_ERROR); + return new IteratorSourceReader<>(readerContext); + } + + /** The single split of the {@link SingleThreadedIteratorSource}. 
*/ + static class GlobalSplit implements IteratorSourceSplit> { + private final Iterator iterator; + + GlobalSplit(Iterator iterator) { + this.iterator = iterator; + } + + @Override + public String splitId() { + return "1"; + } + + @Override + public Iterator getIterator() { + return iterator; + } + + @Override + public IteratorSourceSplit> getUpdatedSplitForIterator( + final Iterator newIterator) { + return new GlobalSplit<>(newIterator); + } + + @Override + public String toString() { + return String.format("GlobalSplit (%s)", iterator); + } + } + + private static final class SplitSerializer + implements SimpleVersionedSerializer> { + private final SimpleVersionedSerializer> iteratorSerializer; + + SplitSerializer(SimpleVersionedSerializer> iteratorSerializer) { + this.iteratorSerializer = iteratorSerializer; + } + + private static final int CURRENT_VERSION = 1; + + @Override + public int getVersion() { + return CURRENT_VERSION; + } + + @Override + public byte[] serialize(GlobalSplit split) throws IOException { + return iteratorSerializer.serialize(split.iterator); + } + + @Override + public GlobalSplit deserialize(int version, byte[] serialized) throws IOException { + return new GlobalSplit<>(iteratorSerializer.deserialize(version, serialized)); + } + } + + private static final class EnumeratorSerializer + implements SimpleVersionedSerializer>> { + private static final int CURRENT_VERSION = 1; + private final SimpleVersionedSerializer> iteratorSerializer; + + EnumeratorSerializer(SimpleVersionedSerializer> iteratorSerializer) { + this.iteratorSerializer = iteratorSerializer; + } + + @Override + public int getVersion() { + return CURRENT_VERSION; + } + + @Override + public byte[] serialize(Collection> checkpoint) throws IOException { + Preconditions.checkArgument(checkpoint.size() < 2, PARALLELISM_ERROR); + if (checkpoint.isEmpty()) { + return new byte[] {0}; + } else { + byte[] iterator = iteratorSerializer.serialize(checkpoint.iterator().next().getIterator()); + byte[] result = new byte[iterator.length + 1]; + result[0] = 1; + System.arraycopy(iterator, 0, result, 1, iterator.length); + return result; + } + } + + @Override + public Collection> deserialize(int version, byte[] serialized) + throws IOException { + if (serialized[0] == 0) { + return Lists.newArrayList(); + } else { + byte[] iterator = new byte[serialized.length - 1]; + System.arraycopy(serialized, 1, iterator, 0, serialized.length - 1); + return Lists.newArrayList( + new GlobalSplit<>(iteratorSerializer.deserialize(version, iterator))); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java new file mode 100644 index 000000000000..8a185ba8a912 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
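 // Illustrative sketch, not part of this patch: a minimal SingleThreadedIteratorSource subclass // that emits an increasing counter. It lives in the same package because createIterator() and // iteratorSerializer() are package-private; all names below are made up. package org.apache.iceberg.flink.maintenance.operator; import java.io.IOException; import java.util.Iterator; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.connector.source.Boundedness; import org.apache.flink.core.io.SimpleVersionedSerializer; import org.apache.flink.core.memory.DataInputDeserializer; import org.apache.flink.core.memory.DataOutputSerializer; class CounterSource extends SingleThreadedIteratorSource<Long> { @Override public Boundedness getBoundedness() { return Boundedness.CONTINUOUS_UNBOUNDED; } @Override public TypeInformation<Long> getProducedType() { return TypeInformation.of(Long.class); } @Override Iterator<Long> createIterator() { return new CounterIterator(0L); } @Override SimpleVersionedSerializer<Iterator<Long>> iteratorSerializer() { return new SimpleVersionedSerializer<Iterator<Long>>() { @Override public int getVersion() { return 1; } @Override public byte[] serialize(Iterator<Long> iterator) throws IOException { // Store the next value so the iterator can resume after a failover. DataOutputSerializer out = new DataOutputSerializer(8); out.writeLong(((CounterIterator) iterator).next); return out.getCopyOfBuffer(); } @Override public Iterator<Long> deserialize(int version, byte[] serialized) throws IOException { return new CounterIterator(new DataInputDeserializer(serialized).readLong()); } }; } static class CounterIterator implements Iterator<Long> { private long next; CounterIterator(long next) { this.next = next; } @Override public boolean hasNext() { return true; } @Override public Long next() { return next++; } } } 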
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.TwoInputStreamOperator; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Skip file deletion processing when an error is encountered. */ +@Internal +public class SkipOnError extends AbstractStreamOperator + implements TwoInputStreamOperator { + private static final Logger LOG = LoggerFactory.getLogger(SkipOnError.class); + private transient ListState filesToDelete; + private transient ListState hasError; + private boolean hasErrorFlag = false; + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + super.initializeState(context); + this.filesToDelete = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("blockOnErrorFiles", String.class)); + this.hasError = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("blockOnErrorHasError", Types.BOOLEAN)); + + if (!Iterables.isEmpty(hasError.get())) { + hasErrorFlag = true; + } + } + + @Override + public void processElement1(StreamRecord element) throws Exception { + if (!hasErrorFlag) { + filesToDelete.add(element.getValue()); + } + } + + @Override + public void processElement2(StreamRecord element) throws Exception { + hasError.add(true); + hasErrorFlag = true; + filesToDelete.clear(); + } + + @Override + public void processWatermark(Watermark mark) throws Exception { + try { + if (!hasErrorFlag) { + filesToDelete.get().forEach(file -> output.collect(new StreamRecord<>(file))); + } else { + LOG.info("Omitting result on failure at {}", mark.getTimestamp()); + } + } finally { + filesToDelete.clear(); + hasError.clear(); + hasErrorFlag = false; + } + + super.processWatermark(mark); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java new file mode 100644 index 000000000000..87600c52304a --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.Objects; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +/** Event describing changes in an Iceberg table */ +@Internal +public class TableChange { + private int dataFileCount; + private long dataFileSizeInBytes; + private int posDeleteFileCount; + private long posDeleteRecordCount; + private int eqDeleteFileCount; + private long eqDeleteRecordCount; + private int commitCount; + + private TableChange( + int dataFileCount, + long dataFileSizeInBytes, + int posDeleteFileCount, + long posDeleteRecordCount, + int eqDeleteFileCount, + long eqDeleteRecordCount, + int commitCount) { + this.dataFileCount = dataFileCount; + this.dataFileSizeInBytes = dataFileSizeInBytes; + this.posDeleteFileCount = posDeleteFileCount; + this.posDeleteRecordCount = posDeleteRecordCount; + this.eqDeleteFileCount = eqDeleteFileCount; + this.eqDeleteRecordCount = eqDeleteRecordCount; + this.commitCount = commitCount; + } + + TableChange(Snapshot snapshot, FileIO io) { + this(snapshot.addedDataFiles(io), snapshot.addedDeleteFiles(io)); + } + + public TableChange(Iterable dataFiles, Iterable deleteFiles) { + dataFiles.forEach( + dataFile -> { + this.dataFileCount++; + this.dataFileSizeInBytes += dataFile.fileSizeInBytes(); + }); + + deleteFiles.forEach( + deleteFile -> { + switch (deleteFile.content()) { + case POSITION_DELETES: + this.posDeleteFileCount++; + this.posDeleteRecordCount += deleteFile.recordCount(); + break; + case EQUALITY_DELETES: + this.eqDeleteFileCount++; + this.eqDeleteRecordCount += deleteFile.recordCount(); + break; + default: + throw new IllegalArgumentException("Unexpected delete file content: " + deleteFile); + } + }); + + this.commitCount = 1; + } + + static TableChange empty() { + return new TableChange(0, 0L, 0, 0L, 0, 0L, 0); + } + + public static Builder builder() { + return new Builder(); + } + + int dataFileCount() { + return dataFileCount; + } + + long dataFileSizeInBytes() { + return dataFileSizeInBytes; + } + + int posDeleteFileCount() { + return posDeleteFileCount; + } + + long posDeleteRecordCount() { + return posDeleteRecordCount; + } + + int eqDeleteFileCount() { + return eqDeleteFileCount; + } + + long eqDeleteRecordCount() { + return eqDeleteRecordCount; + } + + int commitCount() { + return commitCount; + } + + public void merge(TableChange other) { + this.dataFileCount += other.dataFileCount; + this.dataFileSizeInBytes += other.dataFileSizeInBytes; + this.posDeleteFileCount += other.posDeleteFileCount; + this.posDeleteRecordCount += other.posDeleteRecordCount; + this.eqDeleteFileCount += other.eqDeleteFileCount; + this.eqDeleteRecordCount += other.eqDeleteRecordCount; + this.commitCount += other.commitCount; + } + + TableChange copy() { + return new TableChange( + dataFileCount, + dataFileSizeInBytes, + posDeleteFileCount, + posDeleteRecordCount, + 
eqDeleteFileCount, + eqDeleteRecordCount, + commitCount); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("dataFileCount", dataFileCount) + .add("dataFileSizeInBytes", dataFileSizeInBytes) + .add("posDeleteFileCount", posDeleteFileCount) + .add("posDeleteRecordCount", posDeleteRecordCount) + .add("eqDeleteFileCount", eqDeleteFileCount) + .add("eqDeleteRecordCount", eqDeleteRecordCount) + .add("commitCount", commitCount) + .toString(); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } else if (other == null || getClass() != other.getClass()) { + return false; + } + + TableChange that = (TableChange) other; + return dataFileCount == that.dataFileCount + && dataFileSizeInBytes == that.dataFileSizeInBytes + && posDeleteFileCount == that.posDeleteFileCount + && posDeleteRecordCount == that.posDeleteRecordCount + && eqDeleteFileCount == that.eqDeleteFileCount + && eqDeleteRecordCount == that.eqDeleteRecordCount + && commitCount == that.commitCount; + } + + @Override + public int hashCode() { + return Objects.hash( + dataFileCount, + dataFileSizeInBytes, + posDeleteFileCount, + posDeleteRecordCount, + eqDeleteFileCount, + eqDeleteRecordCount, + commitCount); + } + + public static class Builder { + private int dataFileCount = 0; + private long dataFileSizeInBytes = 0L; + private int posDeleteFileCount = 0; + private long posDeleteRecordCount = 0L; + private int eqDeleteFileCount = 0; + private long eqDeleteRecordCount = 0L; + private int commitCount = 0; + + private Builder() {} + + public Builder dataFileCount(int newDataFileCount) { + this.dataFileCount = newDataFileCount; + return this; + } + + public Builder dataFileSizeInBytes(long newDataFileSizeInBytes) { + this.dataFileSizeInBytes = newDataFileSizeInBytes; + return this; + } + + public Builder posDeleteFileCount(int newPosDeleteFileCount) { + this.posDeleteFileCount = newPosDeleteFileCount; + return this; + } + + public Builder posDeleteRecordCount(long newPosDeleteRecordCount) { + this.posDeleteRecordCount = newPosDeleteRecordCount; + return this; + } + + public Builder eqDeleteFileCount(int newEqDeleteFileCount) { + this.eqDeleteFileCount = newEqDeleteFileCount; + return this; + } + + public Builder eqDeleteRecordCount(long newEqDeleteRecordCount) { + this.eqDeleteRecordCount = newEqDeleteRecordCount; + return this; + } + + public Builder commitCount(int newCommitCount) { + this.commitCount = newCommitCount; + return this; + } + + public TableChange build() { + return new TableChange( + dataFileCount, + dataFileSizeInBytes, + posDeleteFileCount, + posDeleteRecordCount, + eqDeleteFileCount, + eqDeleteRecordCount, + commitCount); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java new file mode 100644 index 000000000000..897760caaacc --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
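 // Illustrative sketch, not part of this patch: building and merging TableChange events the way // MonitorSource accumulates them across snapshots. The counts are made up. import org.apache.iceberg.flink.maintenance.operator.TableChange; class TableChangeSketch { static TableChange aggregate() { TableChange first = TableChange.builder() .dataFileCount(10) .dataFileSizeInBytes(512L * 1024 * 1024) .commitCount(1) .build(); TableChange second = TableChange.builder() .posDeleteFileCount(2) .posDeleteRecordCount(1000L) .commitCount(1) .build(); // merge() sums every counter, so the result reflects both commits. first.merge(second); return first; // dataFileCount=10, posDeleteFileCount=2, commitCount=2 } } 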
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.metrics.MetricGroup; + +public class TableMaintenanceMetrics { + public static final String GROUP_KEY = "maintenance"; + public static final String TASK_NAME_KEY = "taskName"; + public static final String TASK_INDEX_KEY = "taskIndex"; + public static final String TABLE_NAME_KEY = "tableName"; + + // Operator error counter + public static final String ERROR_COUNTER = "error"; + + // TriggerManager metrics + public static final String RATE_LIMITER_TRIGGERED = "rateLimiterTriggered"; + public static final String CONCURRENT_RUN_THROTTLED = "concurrentRunThrottled"; + public static final String TRIGGERED = "triggered"; + public static final String NOTHING_TO_TRIGGER = "nothingToTrigger"; + + // LockRemover metrics + public static final String SUCCEEDED_TASK_COUNTER = "succeededTasks"; + public static final String FAILED_TASK_COUNTER = "failedTasks"; + public static final String LAST_RUN_DURATION_MS = "lastRunDurationMs"; + + // DeleteFiles metrics + public static final String DELETE_FILE_FAILED_COUNTER = "deleteFailed"; + public static final String DELETE_FILE_SUCCEEDED_COUNTER = "deleteSucceeded"; + + // DataFileUpdater metrics + public static final String ADDED_DATA_FILE_NUM_METRIC = "addedDataFileNum"; + public static final String ADDED_DATA_FILE_SIZE_METRIC = "addedDataFileSize"; + public static final String REMOVED_DATA_FILE_NUM_METRIC = "removedDataFileNum"; + public static final String REMOVED_DATA_FILE_SIZE_METRIC = "removedDataFileSize"; + + static MetricGroup groupFor( + RuntimeContext context, String tableName, String taskName, int taskIndex) { + return groupFor(groupFor(context, tableName), taskName, taskIndex); + } + + static MetricGroup groupFor(RuntimeContext context, String tableName) { + return context + .getMetricGroup() + .addGroup(TableMaintenanceMetrics.GROUP_KEY) + .addGroup(TableMaintenanceMetrics.TABLE_NAME_KEY, tableName); + } + + static MetricGroup groupFor(MetricGroup mainGroup, String taskName, int taskIndex) { + return mainGroup + .addGroup(TableMaintenanceMetrics.TASK_NAME_KEY, taskName) + .addGroup(TableMaintenanceMetrics.TASK_INDEX_KEY, String.valueOf(taskIndex)); + } + + private TableMaintenanceMetrics() { + // do not instantiate + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java new file mode 100644 index 000000000000..0b6b09b8902a --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
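As a rough sketch of how an operator registers a counter under the metric group hierarchy defined above (maintenance, then table name, then task name and index); the table and task names are placeholders, the groupFor helpers are package-private, and this would run inside a rich operator's open():

// Register the shared error counter for one maintenance task (placeholder names).
Counter errorCounter =
    TableMaintenanceMetrics.groupFor(getRuntimeContext(), "db.tbl", "ExpireSnapshots", 0)
        .counter(TableMaintenanceMetrics.ERROR_COUNTER);
errorCounter.inc();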
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.metrics.Counter; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.table.data.RowData; +import org.apache.flink.util.Collector; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.MetadataTableUtils; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.reader.MetaDataReaderFunction; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Reads the records from the metadata table splits. */ +abstract class TableReader extends ProcessFunction { + private static final Logger LOG = LoggerFactory.getLogger(TableReader.class); + + private final TableLoader tableLoader; + private final String taskName; + private final int taskIndex; + private final Schema projectedSchema; + private IcebergSourceSplitSerializer splitSerializer; + private final ScanContext scanContext; + private final MetadataTableType metadataTableType; + + private transient MetaDataReaderFunction rowDataReaderFunction; + private transient Counter errorCounter; + + TableReader( + String taskName, + int taskIndex, + TableLoader tableLoader, + Schema projectedSchema, + ScanContext scanContext, + MetadataTableType metadataTableType) { + Preconditions.checkNotNull(taskName, "Task name should no be null"); + Preconditions.checkNotNull(tableLoader, "Table should no be null"); + Preconditions.checkNotNull(projectedSchema, "The projected schema should no be null"); + + this.tableLoader = tableLoader; + this.taskName = taskName; + this.taskIndex = taskIndex; + this.projectedSchema = projectedSchema; + this.scanContext = scanContext; + this.metadataTableType = metadataTableType; + } + + @Override + public void open(OpenContext openContext) throws Exception { + tableLoader.open(); + Table table = tableLoader.loadTable(); + Table metaTable = MetadataTableUtils.createMetadataTableInstance(table, metadataTableType); + this.errorCounter = + TableMaintenanceMetrics.groupFor(getRuntimeContext(), table.name(), taskName, taskIndex) + .counter(TableMaintenanceMetrics.ERROR_COUNTER); + this.rowDataReaderFunction = + new MetaDataReaderFunction( + new Configuration(), + metaTable.schema(), + projectedSchema, + metaTable.io(), + metaTable.encryption()); + this.splitSerializer = new 
IcebergSourceSplitSerializer(scanContext.caseSensitive()); + } + + @Override + public void processElement( + MetadataTablePlanner.SplitInfo splitInfo, Context ctx, Collector out) throws Exception { + IcebergSourceSplit split = splitSerializer.deserialize(splitInfo.version(), splitInfo.split()); + try (DataIterator iterator = rowDataReaderFunction.createDataIterator(split)) { + iterator.forEachRemaining(rowData -> extract(rowData, out)); + } catch (Exception e) { + LOG.warn("Exception processing split {} at {}", split, ctx.timestamp(), e); + ctx.output(DeleteOrphanFiles.ERROR_STREAM, e); + errorCounter.inc(); + } + } + + @Override + public void close() throws Exception { + super.close(); + tableLoader.close(); + } + + /** + * Extracts the desired data from the given RowData. + * + * @param rowData the RowData from which to extract + * @param out the Collector to which to output the extracted data + */ + abstract void extract(RowData rowData, Collector out); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java new file mode 100644 index 000000000000..bd8f709e37ab --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.List; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.TwoInputStreamOperator; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.util.OutputTag; +import org.apache.iceberg.flink.maintenance.api.TaskResult; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Aggregates results of the operators for a given maintenance task. + * + *

    + *
+ * <ul>
+ *   <li>Input 1 is used:
+ *       <ul>
+ *         <li>To provide the {@link TaskResult#startEpoch()} - should be chained to the task input
+ *         <li>To mark that the task is finished - should be chained at the end of the task, so an
+ *             incoming watermark will signal that the task is finished
+ *       </ul>
+ *   <li>Input 2 expects an {@link Exception} which caused the failure - should be chained to the
+ *       {@link #ERROR_STREAM} of the operators
+ * </ul>
+ * + * The operator emits a {@link TaskResult} with the overall result on {@link Watermark}. + */ +@Internal +public class TaskResultAggregator extends AbstractStreamOperator + implements TwoInputStreamOperator { + public static final OutputTag ERROR_STREAM = + new OutputTag<>("error-stream", TypeInformation.of(Exception.class)); + + private static final Logger LOG = LoggerFactory.getLogger(TaskResultAggregator.class); + + private final String tableName; + private final String taskName; + private final int taskIndex; + private final List exceptions; + private transient long startTime; + + public TaskResultAggregator(String tableName, String taskName, int taskIndex) { + Preconditions.checkNotNull(tableName, "Table name should no be null"); + Preconditions.checkNotNull(taskName, "Task name should no be null"); + + this.tableName = tableName; + this.taskName = taskName; + this.taskIndex = taskIndex; + this.exceptions = Lists.newArrayList(); + } + + @Override + public void processElement1(StreamRecord streamRecord) { + startTime = streamRecord.getValue().timestamp(); + } + + @Override + public void processElement2(StreamRecord streamRecord) { + Preconditions.checkNotNull(streamRecord.getValue(), "Exception could not be `null`."); + exceptions.add(streamRecord.getValue()); + } + + @Override + public void processWatermark(Watermark mark) throws Exception { + if (startTime != 0L) { + TaskResult response = new TaskResult(taskIndex, startTime, exceptions.isEmpty(), exceptions); + output.collect(new StreamRecord<>(response)); + LOG.info( + "Aggregated result for table {}, task {}[{}] is {}", + tableName, + taskName, + taskIndex, + response); + exceptions.clear(); + startTime = 0L; + } + + super.processWatermark(mark); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java new file mode 100644 index 000000000000..d448898bdfe6 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
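A possible wiring sketch for the aggregator defined above, assuming a DataStream of Trigger messages and a DataStream of Exceptions collected from the task's ERROR_STREAM side outputs; the stream variables and operator name are illustrative:

// Emit one TaskResult per watermark for the connected trigger and error streams.
SingleOutputStreamOperator<TaskResult> results =
    triggerStream
        .connect(errorStream)
        .transform(
            "RewriteDataFiles result aggregator",
            TypeInformation.of(TaskResult.class),
            new TaskResultAggregator("db.tbl", "RewriteDataFiles", 0));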
+ */
+package org.apache.iceberg.flink.maintenance.operator;
+
+import java.io.Serializable;
+import java.time.Duration;
+import java.util.List;
+import org.apache.flink.annotation.Internal;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Internal
+public class TriggerEvaluator implements Serializable {
+  private static final Logger LOG = LoggerFactory.getLogger(TriggerEvaluator.class);
+  private final List<Predicate> predicates;
+
+  private TriggerEvaluator(List<Predicate> predicates) {
+    Preconditions.checkArgument(!predicates.isEmpty(), "Provide at least 1 condition.");
+
+    this.predicates = predicates;
+  }
+
+  boolean check(TableChange event, long lastTimeMs, long currentTimeMs) {
+    boolean result =
+        predicates.stream().anyMatch(p -> p.evaluate(event, lastTimeMs, currentTimeMs));
+    LOG.debug(
+        "Checking event: {}, at {}, last: {} with result: {}",
+        event,
+        currentTimeMs,
+        lastTimeMs,
+        result);
+    return result;
+  }
+
+  public static class Builder implements Serializable {
+    private Integer dataFileCount;
+    private Long dataFileSizeInBytes;
+    private Integer posDeleteFileCount;
+    private Long posDeleteRecordCount;
+    private Integer eqDeleteFileCount;
+    private Long eqDeleteRecordCount;
+    private Integer commitCount;
+    private Duration timeout;
+
+    public Builder dataFileCount(int newDataFileCount) {
+      this.dataFileCount = newDataFileCount;
+      return this;
+    }
+
+    public Builder dataFileSizeInBytes(long newDataFileSizeInBytes) {
+      this.dataFileSizeInBytes = newDataFileSizeInBytes;
+      return this;
+    }
+
+    public Builder posDeleteFileCount(int newPosDeleteFileCount) {
+      this.posDeleteFileCount = newPosDeleteFileCount;
+      return this;
+    }
+
+    public Builder posDeleteRecordCount(long newPosDeleteRecordCount) {
+      this.posDeleteRecordCount = newPosDeleteRecordCount;
+      return this;
+    }
+
+    public Builder eqDeleteFileCount(int newEqDeleteFileCount) {
+      this.eqDeleteFileCount = newEqDeleteFileCount;
+      return this;
+    }
+
+    public Builder eqDeleteRecordCount(long newEqDeleteRecordCount) {
+      this.eqDeleteRecordCount = newEqDeleteRecordCount;
+      return this;
+    }
+
+    public Builder commitCount(int newCommitCount) {
+      this.commitCount = newCommitCount;
+      return this;
+    }
+
+    public Builder timeout(Duration newTimeout) {
+      this.timeout = newTimeout;
+      return this;
+    }
+
+    public TriggerEvaluator build() {
+      List<Predicate> predicates = Lists.newArrayList();
+      if (dataFileCount != null) {
+        predicates.add((change, unused, unused2) -> change.dataFileCount() >= dataFileCount);
+      }
+
+      if (dataFileSizeInBytes != null) {
+        predicates.add(
+            (change, unused, unused2) -> change.dataFileSizeInBytes() >= dataFileSizeInBytes);
+      }
+
+      if (posDeleteFileCount != null) {
+        predicates.add(
+            (change, unused, unused2) -> change.posDeleteFileCount() >= posDeleteFileCount);
+      }
+
+      if (posDeleteRecordCount != null) {
+        predicates.add(
+            (change, unused, unused2) -> change.posDeleteRecordCount() >= posDeleteRecordCount);
+      }
+
+      if (eqDeleteFileCount != null) {
+        predicates.add(
+            (change, unused, unused2) -> change.eqDeleteFileCount() >= eqDeleteFileCount);
+      }
+
+      if (eqDeleteRecordCount != null) {
+        predicates.add(
+            (change, unused, unused2) -> change.eqDeleteRecordCount() >= eqDeleteRecordCount);
+      }
+
+      if (commitCount != null) {
+        predicates.add((change, unused, unused2) -> change.commitCount() >= commitCount);
+      }
+
+      if (timeout != null) {
+        predicates.add(
+            (change, lastTimeMs,
currentTimeMs) -> + currentTimeMs - lastTimeMs >= timeout.toMillis()); + } + + return new TriggerEvaluator(predicates); + } + } + + private interface Predicate extends Serializable { + boolean evaluate(TableChange event, long lastTimeMs, long currentTimeMs); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java new file mode 100644 index 000000000000..f1f2b51c0943 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.io.IOException; +import java.util.List; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.state.ValueState; +import org.apache.flink.api.common.state.ValueStateDescriptor; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.FunctionSnapshotContext; +import org.apache.flink.streaming.api.TimerService; +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; +import org.apache.flink.streaming.api.functions.KeyedProcessFunction; +import org.apache.flink.util.Collector; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * TriggerManager starts the Maintenance Tasks by emitting {@link Trigger} messages which are + * calculated based on the incoming {@link TableChange} messages. The TriggerManager keeps track of + * the changes since the last run of the Maintenance Tasks and triggers a new run based on the + * result of the {@link TriggerEvaluator}. + * + *
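For illustration, a minimal sketch of building an evaluator with the Builder above and passing it to the TriggerManager constructor introduced below; the thresholds, task name, delays, loader, and lock factory are all assumed:

// Fire when at least 10 new data files or ~100 MB of new data have accumulated,
// or when 30 minutes have passed since the last run (illustrative thresholds).
TriggerEvaluator rewriteEvaluator =
    new TriggerEvaluator.Builder()
        .dataFileCount(10)
        .dataFileSizeInBytes(100L * 1024 * 1024)
        .timeout(Duration.ofMinutes(30))
        .build();

TriggerManager triggerManager =
    new TriggerManager(
        tableLoader, // assumed TableLoader
        lockFactory, // assumed TriggerLockFactory
        List.of("RewriteDataFiles"), // one name per maintenance task
        List.of(rewriteEvaluator), // one evaluator per maintenance task
        60_000L, // minFireDelayMs
        10_000L); // lockCheckDelayMs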

The TriggerManager prevents overlapping Maintenance Task runs using {@link + * TriggerLockFactory.Lock}. The current implementation only handles conflicts within a single job. + * Users should avoid scheduling maintenance for the same table in different Flink jobs. + * + *

The TriggerManager should run as a global operator. {@link KeyedProcessFunction} is used, so + * the timer functions are available, but the key is not used. + */ +@Internal +public class TriggerManager extends KeyedProcessFunction + implements CheckpointedFunction { + private static final Logger LOG = LoggerFactory.getLogger(TriggerManager.class); + + private final String tableName; + private final TriggerLockFactory lockFactory; + private final List maintenanceTaskNames; + private final List evaluators; + private final long minFireDelayMs; + private final long lockCheckDelayMs; + private transient Counter rateLimiterTriggeredCounter; + private transient Counter concurrentRunThrottledCounter; + private transient Counter nothingToTriggerCounter; + private transient List triggerCounters; + private transient ValueState nextEvaluationTimeState; + private transient ListState accumulatedChangesState; + private transient ListState lastTriggerTimesState; + private transient Long nextEvaluationTime; + private transient List accumulatedChanges; + private transient List lastTriggerTimes; + private transient TriggerLockFactory.Lock lock; + private transient TriggerLockFactory.Lock recoveryLock; + private transient boolean shouldRestoreTasks = false; + private transient boolean inited = false; + // To keep the task scheduling fair we keep the last triggered task position in memory. + // If we find a task to trigger, then we run it, but after it is finished, we start from the given + // position to prevent "starvation" of the tasks. + // When there is nothing to trigger, we start from the beginning, as the order of the tasks might + // be important (RewriteDataFiles first, and then RewriteManifestFiles later) + private transient int startsFrom = 0; + private transient boolean triggered = false; + + public TriggerManager( + TableLoader tableLoader, + TriggerLockFactory lockFactory, + List maintenanceTaskNames, + List evaluators, + long minFireDelayMs, + long lockCheckDelayMs) { + Preconditions.checkNotNull(tableLoader, "Table loader should no be null"); + Preconditions.checkNotNull(lockFactory, "Lock factory should no be null"); + Preconditions.checkArgument( + maintenanceTaskNames != null && !maintenanceTaskNames.isEmpty(), + "Invalid maintenance task names: null or empty"); + Preconditions.checkArgument( + evaluators != null && !evaluators.isEmpty(), "Invalid evaluators: null or empty"); + Preconditions.checkArgument( + maintenanceTaskNames.size() == evaluators.size(), + "Provide a name and evaluator for all of the maintenance tasks"); + Preconditions.checkArgument(minFireDelayMs > 0, "Minimum fire delay should be at least 1."); + Preconditions.checkArgument( + lockCheckDelayMs > 0, "Minimum lock delay rate should be at least 1 ms."); + + tableLoader.open(); + this.tableName = tableLoader.loadTable().name(); + this.lockFactory = lockFactory; + this.maintenanceTaskNames = maintenanceTaskNames; + this.evaluators = evaluators; + this.minFireDelayMs = minFireDelayMs; + this.lockCheckDelayMs = lockCheckDelayMs; + } + + @Override + public void open(OpenContext parameters) throws Exception { + MetricGroup mainGroup = TableMaintenanceMetrics.groupFor(getRuntimeContext(), tableName); + this.rateLimiterTriggeredCounter = + mainGroup.counter(TableMaintenanceMetrics.RATE_LIMITER_TRIGGERED); + this.concurrentRunThrottledCounter = + mainGroup.counter(TableMaintenanceMetrics.CONCURRENT_RUN_THROTTLED); + this.nothingToTriggerCounter = mainGroup.counter(TableMaintenanceMetrics.NOTHING_TO_TRIGGER); + 
this.triggerCounters = Lists.newArrayListWithExpectedSize(maintenanceTaskNames.size()); + for (int taskIndex = 0; taskIndex < maintenanceTaskNames.size(); ++taskIndex) { + triggerCounters.add( + TableMaintenanceMetrics.groupFor( + mainGroup, maintenanceTaskNames.get(taskIndex), taskIndex) + .counter(TableMaintenanceMetrics.TRIGGERED)); + } + + this.nextEvaluationTimeState = + getRuntimeContext() + .getState(new ValueStateDescriptor<>("triggerManagerNextTriggerTime", Types.LONG)); + this.accumulatedChangesState = + getRuntimeContext() + .getListState( + new ListStateDescriptor<>( + "triggerManagerAccumulatedChange", TypeInformation.of(TableChange.class))); + this.lastTriggerTimesState = + getRuntimeContext() + .getListState(new ListStateDescriptor<>("triggerManagerLastTriggerTime", Types.LONG)); + } + + @Override + public void snapshotState(FunctionSnapshotContext context) throws Exception { + if (inited) { + // Only store state if initialized + nextEvaluationTimeState.update(nextEvaluationTime); + accumulatedChangesState.update(accumulatedChanges); + lastTriggerTimesState.update(lastTriggerTimes); + LOG.info( + "Storing state: nextEvaluationTime {}, accumulatedChanges {}, lastTriggerTimes {}", + nextEvaluationTime, + accumulatedChanges, + lastTriggerTimes); + } else { + LOG.info("Not initialized, state is not stored"); + } + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + LOG.info("Initializing state restored: {}", context.isRestored()); + lockFactory.open(); + this.lock = lockFactory.createLock(); + this.recoveryLock = lockFactory.createRecoveryLock(); + if (context.isRestored()) { + shouldRestoreTasks = true; + } else { + lock.unlock(); + recoveryLock.unlock(); + } + } + + @Override + public void processElement(TableChange change, Context ctx, Collector out) + throws Exception { + init(out, ctx.timerService()); + + accumulatedChanges.forEach(tableChange -> tableChange.merge(change)); + + long current = ctx.timerService().currentProcessingTime(); + if (nextEvaluationTime == null) { + checkAndFire(current, ctx.timerService(), out); + } else { + LOG.info( + "Trigger manager rate limiter triggered current: {}, next: {}, accumulated changes: {}", + current, + nextEvaluationTime, + accumulatedChanges); + rateLimiterTriggeredCounter.inc(); + } + } + + @Override + public void onTimer(long timestamp, OnTimerContext ctx, Collector out) throws Exception { + init(out, ctx.timerService()); + this.nextEvaluationTime = null; + checkAndFire(ctx.timerService().currentProcessingTime(), ctx.timerService(), out); + } + + @Override + public void close() throws IOException { + lockFactory.close(); + } + + private void checkAndFire(long current, TimerService timerService, Collector out) { + if (shouldRestoreTasks) { + if (recoveryLock.isHeld()) { + // Recovered tasks in progress. 
Skip trigger check + LOG.debug("The recovery lock is still held at {}", current); + schedule(timerService, current + lockCheckDelayMs); + return; + } else { + LOG.info("The recovery is finished at {}", current); + shouldRestoreTasks = false; + } + } + + Integer taskToStart = + nextTrigger(evaluators, accumulatedChanges, lastTriggerTimes, current, startsFrom); + if (taskToStart == null) { + // Nothing to execute + if (!triggered) { + nothingToTriggerCounter.inc(); + LOG.debug("Nothing to execute at {} for collected: {}", current, accumulatedChanges); + } else { + LOG.debug("Execution check finished"); + } + + // Next time start from the beginning + startsFrom = 0; + triggered = false; + return; + } + + if (lock.tryLock()) { + TableChange change = accumulatedChanges.get(taskToStart); + out.collect(Trigger.create(current, taskToStart)); + LOG.debug("Fired event with time: {}, collected: {} for {}", current, change, tableName); + triggerCounters.get(taskToStart).inc(); + accumulatedChanges.set(taskToStart, TableChange.empty()); + lastTriggerTimes.set(taskToStart, current); + schedule(timerService, current + minFireDelayMs); + startsFrom = (taskToStart + 1) % evaluators.size(); + triggered = true; + } else { + // A task is already running, waiting for it to finish + LOG.info("Failed to acquire lock. Delaying task to {}", current + lockCheckDelayMs); + + startsFrom = taskToStart; + concurrentRunThrottledCounter.inc(); + schedule(timerService, current + lockCheckDelayMs); + } + + timerService.registerProcessingTimeTimer(nextEvaluationTime); + } + + private void schedule(TimerService timerService, long time) { + this.nextEvaluationTime = time; + timerService.registerProcessingTimeTimer(time); + } + + private static Integer nextTrigger( + List evaluators, + List changes, + List lastTriggerTimes, + long currentTime, + int startPos) { + int current = startPos; + do { + if (evaluators + .get(current) + .check(changes.get(current), lastTriggerTimes.get(current), currentTime)) { + return current; + } + + current = (current + 1) % evaluators.size(); + } while (current != startPos); + + return null; + } + + private void init(Collector out, TimerService timerService) throws Exception { + if (!inited) { + long current = timerService.currentProcessingTime(); + + // Initialize from state + this.nextEvaluationTime = nextEvaluationTimeState.value(); + this.accumulatedChanges = Lists.newArrayList(accumulatedChangesState.get()); + this.lastTriggerTimes = Lists.newArrayList(lastTriggerTimesState.get()); + + // Initialize if the state was empty + if (accumulatedChanges.isEmpty()) { + for (int i = 0; i < evaluators.size(); ++i) { + accumulatedChanges.add(TableChange.empty()); + lastTriggerTimes.add(current); + } + } + + if (shouldRestoreTasks) { + // When the job state is restored, there could be ongoing tasks. + // To prevent collision with the new triggers the following is done: + // - add a recovery lock + // - fire a recovery trigger + // This ensures that the tasks of the previous trigger are executed, and the lock is removed + // in the end. The result of the 'tryLock' is ignored as an already existing lock prevents + // collisions as well. 
+ recoveryLock.tryLock(); + out.collect(Trigger.recovery(current)); + if (nextEvaluationTime == null) { + schedule(timerService, current + minFireDelayMs); + } + } + + inited = true; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java new file mode 100644 index 000000000000..f7e8e0c884cf --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.formats.avro.AvroToRowDataConverters; +import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.iceberg.avro.AvroSchemaUtil; + +/** + * This util class converts Avro GenericRecord to Flink RowData.
+ *
+ * Internally it uses Flink {@link AvroToRowDataConverters}. Because of the precision difference + * between how Iceberg schema (micro) and Flink {@link AvroToRowDataConverters} (milli) deal with + * time type, we can't directly use the Avro Schema converted from Iceberg schema via {@link + * AvroSchemaUtil#convert(org.apache.iceberg.Schema, String)}. + */ +public class AvroGenericRecordToRowDataMapper implements MapFunction { + + private final AvroToRowDataConverters.AvroToRowDataConverter converter; + + AvroGenericRecordToRowDataMapper(RowType rowType) { + this.converter = AvroToRowDataConverters.createRowConverter(rowType); + } + + @Override + public RowData map(GenericRecord genericRecord) throws Exception { + return (RowData) converter.convert(genericRecord); + } + + /** Create a mapper based on Avro schema. */ + public static AvroGenericRecordToRowDataMapper forAvroSchema(Schema avroSchema) { + DataType dataType = AvroSchemaConverter.convertToDataType(avroSchema.toString()); + LogicalType logicalType = TypeConversions.fromDataToLogicalType(dataType); + RowType rowType = RowType.of(logicalType.getChildren().toArray(new LogicalType[0])); + return new AvroGenericRecordToRowDataMapper(rowType); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java new file mode 100644 index 000000000000..d845046cd2f6 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
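A short usage sketch for the mapper above, assuming an upstream DataStream of Avro GenericRecord and the matching Avro schema:

// Convert Avro records to RowData before handing them to an Iceberg sink
// (avroSchema and avroRecordStream are assumed to exist).
AvroGenericRecordToRowDataMapper mapper =
    AvroGenericRecordToRowDataMapper.forAvroSchema(avroSchema);
DataStream<RowData> rowDataStream = avroRecordStream.map(mapper);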
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.Set; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.deletes.DeleteGranularity; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.data.RowDataProjection; +import org.apache.iceberg.io.BaseTaskWriter; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; + +abstract class BaseDeltaTaskWriter extends BaseTaskWriter { + + private final Schema schema; + private final Schema deleteSchema; + private final RowDataWrapper wrapper; + private final RowDataWrapper keyWrapper; + private final RowDataProjection keyProjection; + private final boolean upsert; + + BaseDeltaTaskWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + Set equalityFieldIds, + boolean upsert) { + super(spec, format, appenderFactory, fileFactory, io, targetFileSize); + this.schema = schema; + this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); + this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + this.keyWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); + this.keyProjection = + RowDataProjection.create(flinkSchema, schema.asStruct(), deleteSchema.asStruct()); + this.upsert = upsert; + } + + abstract RowDataDeltaWriter route(RowData row); + + RowDataWrapper wrapper() { + return wrapper; + } + + @Override + public void write(RowData row) throws IOException { + RowDataDeltaWriter writer = route(row); + + switch (row.getRowKind()) { + case INSERT: + case UPDATE_AFTER: + if (upsert) { + writer.deleteKey(keyProjection.wrap(row)); + } + writer.write(row); + break; + + case UPDATE_BEFORE: + if (upsert) { + break; // UPDATE_BEFORE is not necessary for UPSERT, we do nothing to prevent delete one + // row twice + } + writer.delete(row); + break; + case DELETE: + if (upsert) { + writer.deleteKey(keyProjection.wrap(row)); + } else { + writer.delete(row); + } + break; + + default: + throw new UnsupportedOperationException("Unknown row kind: " + row.getRowKind()); + } + } + + protected class RowDataDeltaWriter extends BaseEqualityDeltaWriter { + RowDataDeltaWriter(PartitionKey partition) { + super(partition, schema, deleteSchema, DeleteGranularity.FILE); + } + + @Override + protected StructLike asStructLike(RowData data) { + return wrapper.wrap(data); + } + + @Override + protected StructLike asStructLikeKey(RowData data) { + return keyWrapper.wrap(data); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java new file mode 100644 index 000000000000..1cb6e013bd2c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.stream.IntStream; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.RowDataWrapper; + +/** + * A {@link KeySelector} that extracts the bucketId from a data row's bucket partition as the key. + * To be used with the {@link BucketPartitioner}. + */ +class BucketPartitionKeySelector implements KeySelector { + + private final Schema schema; + private final PartitionKey partitionKey; + private final RowType flinkSchema; + private final int bucketFieldPosition; + + private transient RowDataWrapper rowDataWrapper; + + BucketPartitionKeySelector(PartitionSpec partitionSpec, Schema schema, RowType flinkSchema) { + this.schema = schema; + this.partitionKey = new PartitionKey(partitionSpec, schema); + this.flinkSchema = flinkSchema; + this.bucketFieldPosition = getBucketFieldPosition(partitionSpec); + } + + private int getBucketFieldPosition(PartitionSpec partitionSpec) { + int bucketFieldId = BucketPartitionerUtil.getBucketFieldId(partitionSpec); + return IntStream.range(0, partitionSpec.fields().size()) + .filter(i -> partitionSpec.fields().get(i).fieldId() == bucketFieldId) + .toArray()[0]; + } + + private RowDataWrapper lazyRowDataWrapper() { + if (rowDataWrapper == null) { + rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + } + + return rowDataWrapper; + } + + @Override + public Integer getKey(RowData rowData) { + partitionKey.partition(lazyRowDataWrapper().wrap(rowData)); + return partitionKey.get(bucketFieldPosition, Integer.class); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java new file mode 100644 index 000000000000..9c9a117906e2 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * This partitioner will redirect records to writers deterministically based on the Bucket partition + * spec. It'll attempt to optimize the file size written depending on whether numPartitions is + * greater, less or equal than the maxNumBuckets. Note: The current implementation only supports ONE + * bucket in the partition spec. + */ +class BucketPartitioner implements Partitioner { + + static final String BUCKET_NULL_MESSAGE = "bucketId cannot be null"; + static final String BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE = + "Invalid bucket ID %s: must be non-negative."; + static final String BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE = + "Invalid bucket ID %s: must be less than bucket limit: %s."; + + private final int maxNumBuckets; + + // To hold the OFFSET of the next writer to use for any bucket, only used when writers > the + // number of buckets + private final int[] currentBucketWriterOffset; + + BucketPartitioner(PartitionSpec partitionSpec) { + this.maxNumBuckets = BucketPartitionerUtil.getMaxNumBuckets(partitionSpec); + this.currentBucketWriterOffset = new int[maxNumBuckets]; + } + + /** + * Determine the partition id based on the following criteria: If the number of writers <= the + * number of buckets, an evenly distributed number of buckets will be assigned to each writer (one + * writer -> many buckets). Conversely, if the number of writers > the number of buckets the logic + * is handled by the {@link #getPartitionWithMoreWritersThanBuckets + * getPartitionWritersGreaterThanBuckets} method. + * + * @param bucketId the bucketId for each request + * @param numPartitions the total number of partitions + * @return the partition id (writer) to use for each request + */ + @Override + public int partition(Integer bucketId, int numPartitions) { + Preconditions.checkNotNull(bucketId, BUCKET_NULL_MESSAGE); + Preconditions.checkArgument(bucketId >= 0, BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE, bucketId); + Preconditions.checkArgument( + bucketId < maxNumBuckets, BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE, bucketId, maxNumBuckets); + + if (numPartitions <= maxNumBuckets) { + return bucketId % numPartitions; + } else { + return getPartitionWithMoreWritersThanBuckets(bucketId, numPartitions); + } + } + + /*- + * If the number of writers > the number of buckets each partitioner will keep a state of multiple + * writers per bucket as evenly as possible, and will round-robin the requests across them, in this + * case each writer will target only one bucket at all times (many writers -> one bucket). 
Example: + * Configuration: numPartitions (writers) = 5, maxBuckets = 2 + * Expected behavior: + * - Records for Bucket 0 will be "round robin" between Writers 0, 2 and 4 + * - Records for Bucket 1 will always use Writer 1 and 3 + * Notes: + * - maxNumWritersPerBucket determines when to reset the currentBucketWriterOffset to 0 for this bucketId + * - When numPartitions is not evenly divisible by maxBuckets, some buckets will have one more writer (extraWriter). + * In this example Bucket 0 has an "extra writer" to consider before resetting its offset to 0. + * + * @return the destination partition index (writer subtask id) + */ + private int getPartitionWithMoreWritersThanBuckets(int bucketId, int numPartitions) { + int currentOffset = currentBucketWriterOffset[bucketId]; + // Determine if this bucket requires an "extra writer" + int extraWriter = bucketId < (numPartitions % maxNumBuckets) ? 1 : 0; + // The max number of writers this bucket can have + int maxNumWritersPerBucket = (numPartitions / maxNumBuckets) + extraWriter; + + // Increment the writer offset or reset if it's reached the max for this bucket + int nextOffset = currentOffset == maxNumWritersPerBucket - 1 ? 0 : currentOffset + 1; + currentBucketWriterOffset[bucketId] = nextOffset; + + return bucketId + (maxNumBuckets * currentOffset); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java new file mode 100644 index 000000000000..c33207728d3e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.transforms.PartitionSpecVisitor; + +final class BucketPartitionerUtil { + static final String BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE = + "Invalid number of buckets: %s (must be 1)"; + + private BucketPartitionerUtil() {} + + /** + * Determines whether the PartitionSpec has one and only one Bucket definition + * + * @param partitionSpec the partition spec in question + * @return whether the PartitionSpec has only one Bucket + */ + static boolean hasOneBucketField(PartitionSpec partitionSpec) { + List> bucketFields = getBucketFields(partitionSpec); + return bucketFields != null && bucketFields.size() == 1; + } + + /** + * Extracts the Bucket definition from a PartitionSpec. 
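A small sketch of the assignment described in the comment above, assuming a partition spec with a single bucket(..., 2) field and 5 writer subtasks; since the class is package-private, this would live in the same package:

// With 2 buckets and 5 writers, bucket 0 rotates over writers 0, 2, 4
// and bucket 1 rotates over writers 1, 3.
BucketPartitioner partitioner = new BucketPartitioner(spec); // assumed single-bucket spec
int w0a = partitioner.partition(0, 5); // 0
int w0b = partitioner.partition(0, 5); // 2
int w0c = partitioner.partition(0, 5); // 4
int w1a = partitioner.partition(1, 5); // 1
int w1b = partitioner.partition(1, 5); // 3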
+ * + * @param partitionSpec the partition spec in question + * @return the Bucket definition in the form of a tuple (fieldId, maxNumBuckets) + */ + private static Tuple2 getBucketFieldInfo(PartitionSpec partitionSpec) { + List> bucketFields = getBucketFields(partitionSpec); + Preconditions.checkArgument( + bucketFields.size() == 1, + BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, + bucketFields.size()); + return bucketFields.get(0); + } + + static int getBucketFieldId(PartitionSpec partitionSpec) { + return getBucketFieldInfo(partitionSpec).f0; + } + + static int getMaxNumBuckets(PartitionSpec partitionSpec) { + return getBucketFieldInfo(partitionSpec).f1; + } + + private static List> getBucketFields(PartitionSpec spec) { + return PartitionSpecVisitor.visit(spec, new BucketPartitionSpecVisitor()).stream() + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + private static class BucketPartitionSpecVisitor + implements PartitionSpecVisitor> { + @Override + public Tuple2 identity(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 bucket( + int fieldId, String sourceName, int sourceId, int numBuckets) { + return new Tuple2<>(fieldId, numBuckets); + } + + @Override + public Tuple2 truncate( + int fieldId, String sourceName, int sourceId, int width) { + return null; + } + + @Override + public Tuple2 year(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 month(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 day(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 hour(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 alwaysNull(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 unknown( + int fieldId, String sourceName, int sourceId, String transform) { + return null; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java new file mode 100644 index 000000000000..0afc07cc1977 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.time.Duration; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.util.DateTimeUtil; +import org.apache.iceberg.util.SerializableSupplier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A table loader that will only reload a table after a certain interval has passed. WARNING: This + * table loader should be used carefully when used with writer tasks. It could result in heavy load + * on a catalog for jobs with many writers. + */ +class CachingTableSupplier implements SerializableSupplier { + + private static final Logger LOG = LoggerFactory.getLogger(CachingTableSupplier.class); + + private final Table initialTable; + private final TableLoader tableLoader; + private final Duration tableRefreshInterval; + private long lastLoadTimeMillis; + private transient Table table; + + CachingTableSupplier( + SerializableTable initialTable, TableLoader tableLoader, Duration tableRefreshInterval) { + Preconditions.checkArgument(initialTable != null, "initialTable cannot be null"); + Preconditions.checkArgument(tableLoader != null, "tableLoader cannot be null"); + Preconditions.checkArgument( + tableRefreshInterval != null, "tableRefreshInterval cannot be null"); + this.initialTable = initialTable; + this.table = initialTable; + this.tableLoader = tableLoader; + this.tableRefreshInterval = tableRefreshInterval; + this.lastLoadTimeMillis = System.currentTimeMillis(); + } + + @Override + public Table get() { + if (table == null) { + this.table = initialTable; + } + return table; + } + + Table initialTable() { + return initialTable; + } + + void refreshTable() { + if (System.currentTimeMillis() > lastLoadTimeMillis + tableRefreshInterval.toMillis()) { + try { + if (!tableLoader.isOpen()) { + tableLoader.open(); + } + + this.table = tableLoader.loadTable(); + this.lastLoadTimeMillis = System.currentTimeMillis(); + + LOG.info( + "Table {} reloaded, next min load time threshold is {}", + table.name(), + DateTimeUtil.formatTimestampMillis( + lastLoadTimeMillis + tableRefreshInterval.toMillis())); + } catch (Exception e) { + LOG.warn("An error occurred reloading table {}, table was not reloaded", table.name(), e); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java new file mode 100644 index 000000000000..1b786e46452f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
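A brief sketch of the caching supplier above; the initial table, loader, and refresh interval are placeholders, and refreshTable() is package-private, so this belongs to writer-side code in the same package:

// Serve a cached Table instance and reload it at most once per refresh interval.
CachingTableSupplier tableSupplier =
    new CachingTableSupplier(serializableTable, tableLoader, Duration.ofMinutes(5));
Table table = tableSupplier.get(); // cached instance
tableSupplier.refreshTable(); // reloads only if the interval has elapsed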
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.Arrays; +import java.util.List; +import java.util.NavigableMap; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.util.ScanTaskUtil; + +@Internal +public class CommitSummary { + + private final AtomicLong dataFilesCount = new AtomicLong(); + private final AtomicLong dataFilesRecordCount = new AtomicLong(); + private final AtomicLong dataFilesByteCount = new AtomicLong(); + private final AtomicLong deleteFilesCount = new AtomicLong(); + private final AtomicLong deleteFilesRecordCount = new AtomicLong(); + private final AtomicLong deleteFilesByteCount = new AtomicLong(); + + public CommitSummary() {} + + public CommitSummary(NavigableMap pendingResults) { + pendingResults.values().forEach(this::addWriteResult); + } + + public void addAll(NavigableMap> pendingResults) { + pendingResults.values().forEach(writeResults -> writeResults.forEach(this::addWriteResult)); + } + + private void addWriteResult(WriteResult writeResult) { + dataFilesCount.addAndGet(writeResult.dataFiles().length); + Arrays.stream(writeResult.dataFiles()) + .forEach( + dataFile -> { + dataFilesRecordCount.addAndGet(dataFile.recordCount()); + dataFilesByteCount.addAndGet(dataFile.fileSizeInBytes()); + }); + deleteFilesCount.addAndGet(writeResult.deleteFiles().length); + Arrays.stream(writeResult.deleteFiles()) + .forEach( + deleteFile -> { + deleteFilesRecordCount.addAndGet(deleteFile.recordCount()); + long deleteBytes = ScanTaskUtil.contentSizeInBytes(deleteFile); + deleteFilesByteCount.addAndGet(deleteBytes); + }); + } + + public long dataFilesCount() { + return dataFilesCount.get(); + } + + long dataFilesRecordCount() { + return dataFilesRecordCount.get(); + } + + long dataFilesByteCount() { + return dataFilesByteCount.get(); + } + + public long deleteFilesCount() { + return deleteFilesCount.get(); + } + + long deleteFilesRecordCount() { + return deleteFilesRecordCount.get(); + } + + long deleteFilesByteCount() { + return deleteFilesByteCount.get(); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("dataFilesCount", dataFilesCount) + .add("dataFilesRecordCount", dataFilesRecordCount) + .add("dataFilesByteCount", dataFilesByteCount) + .add("deleteFilesCount", deleteFilesCount) + .add("deleteFilesRecordCount", deleteFilesRecordCount) + .add("deleteFilesByteCount", deleteFilesByteCount) + .toString(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java new file mode 100644 index 000000000000..1369d98e432b --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
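A minimal sketch of summarizing pending write results with the class above; the checkpoint-keyed map and WriteResult instances are assumed:

// Aggregate data/delete file statistics across pending checkpoints.
NavigableMap<Long, WriteResult> pendingResults = new TreeMap<>();
pendingResults.put(1L, writeResultForCheckpoint1); // assumed WriteResult
pendingResults.put(2L, writeResultForCheckpoint2); // assumed WriteResult
CommitSummary summary = new CommitSummary(pendingResults);
long totalFiles = summary.dataFilesCount() + summary.deleteFilesCount();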
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.Arrays; +import java.util.Map; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; +import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.util.Collector; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.flink.maintenance.operator.TableChange; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Internal +public class CommittableToTableChangeConverter + extends ProcessFunction<CommittableMessage<IcebergCommittable>, TableChange> { + + private static final Logger LOG = + LoggerFactory.getLogger(CommittableToTableChangeConverter.class); + + private final FileIO io; + private final String tableName; + private final Map<Integer, PartitionSpec> specs; + private transient String flinkJobId; + + public CommittableToTableChangeConverter( + FileIO fileIO, String tableName, Map<Integer, PartitionSpec> specs) { + Preconditions.checkNotNull(fileIO, "FileIO should not be null"); + Preconditions.checkNotNull(tableName, "TableName should not be null"); + Preconditions.checkNotNull(specs, "Specs should not be null"); + this.io = fileIO; + this.tableName = tableName; + this.specs = specs; + } + + @Override + public void open(OpenContext openContext) throws Exception { + super.open(openContext); + Preconditions.checkState( + getRuntimeContext().getTaskInfo().getNumberOfParallelSubtasks() == 1, + "CommittableToTableChangeConverter must run with parallelism 1, current parallelism: %s", + getRuntimeContext().getTaskInfo().getNumberOfParallelSubtasks()); + + this.flinkJobId = getRuntimeContext().getJobInfo().getJobId().toString(); + } + + @Override + public void processElement( + CommittableMessage<IcebergCommittable> value, Context ctx, Collector<TableChange> out) + throws Exception { + if (value instanceof CommittableWithLineage) { + IcebergCommittable committable = + ((CommittableWithLineage<IcebergCommittable>) value).getCommittable(); + + if (committable == null || committable.manifest().length == 0) { + return; + } + + DeltaManifests deltaManifests; + WriteResult writeResult; + try { + deltaManifests = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, committable.manifest()); + writeResult = FlinkManifestUtil.readCompletedFiles(deltaManifests, io, specs); + } catch (Exception e) { + LOG.warn( + "Unable to read delta manifests for table {} at checkpoint {}", + tableName, + committable.checkpointId(), + e); + return; + } + + TableChange tableChange = + new TableChange( + Arrays.asList(writeResult.dataFiles()), Arrays.asList(writeResult.deleteFiles())); + out.collect(tableChange); + FlinkManifestUtil.deleteCommittedManifests( + tableName, io, deltaManifests.manifests(), flinkJobId, committable.checkpointId()); + } + } +} diff --git
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java new file mode 100644 index 000000000000..92c50165c0f5 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +@Internal +public class DeltaManifests { + + private static final CharSequence[] EMPTY_REF_DATA_FILES = new CharSequence[0]; + + private final ManifestFile dataManifest; + private final ManifestFile deleteManifest; + private final CharSequence[] referencedDataFiles; + + DeltaManifests(ManifestFile dataManifest, ManifestFile deleteManifest) { + this(dataManifest, deleteManifest, EMPTY_REF_DATA_FILES); + } + + DeltaManifests( + ManifestFile dataManifest, ManifestFile deleteManifest, CharSequence[] referencedDataFiles) { + Preconditions.checkNotNull(referencedDataFiles, "Referenced data files shouldn't be null."); + + this.dataManifest = dataManifest; + this.deleteManifest = deleteManifest; + this.referencedDataFiles = referencedDataFiles; + } + + ManifestFile dataManifest() { + return dataManifest; + } + + ManifestFile deleteManifest() { + return deleteManifest; + } + + CharSequence[] referencedDataFiles() { + return referencedDataFiles; + } + + public List manifests() { + List manifests = Lists.newArrayListWithCapacity(2); + if (dataManifest != null) { + manifests.add(dataManifest); + } + + if (deleteManifest != null) { + manifests.add(deleteManifest); + } + + return manifests; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java new file mode 100644 index 000000000000..6ad41bacf337 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
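An illustrative round-trip of the DeltaManifests holder above through DeltaManifestsSerializer (added in the next file); dataManifest and deleteManifest are assumed ManifestFile instances, and the calls are package-private:

    DeltaManifests deltaManifests = new DeltaManifests(dataManifest, deleteManifest);
    byte[] serialized =
        SimpleVersionedSerialization.writeVersionAndSerialize(
            DeltaManifestsSerializer.INSTANCE, deltaManifests);
    DeltaManifests restored =
        SimpleVersionedSerialization.readVersionAndDeSerialize(
            DeltaManifestsSerializer.INSTANCE, serialized);
    // restored.manifests() lists the non-null data/delete manifests for commit or cleanup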
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import org.apache.flink.annotation.Internal; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +@Internal +public class DeltaManifestsSerializer implements SimpleVersionedSerializer { + private static final int VERSION_1 = 1; + private static final int VERSION_2 = 2; + private static final byte[] EMPTY_BINARY = new byte[0]; + + public static final DeltaManifestsSerializer INSTANCE = new DeltaManifestsSerializer(); + + @Override + public int getVersion() { + return VERSION_2; + } + + @Override + public byte[] serialize(DeltaManifests deltaManifests) throws IOException { + Preconditions.checkNotNull( + deltaManifests, "DeltaManifests to be serialized should not be null"); + + ByteArrayOutputStream binaryOut = new ByteArrayOutputStream(); + DataOutputStream out = new DataOutputStream(binaryOut); + + byte[] dataManifestBinary = EMPTY_BINARY; + if (deltaManifests.dataManifest() != null) { + dataManifestBinary = ManifestFiles.encode(deltaManifests.dataManifest()); + } + + out.writeInt(dataManifestBinary.length); + out.write(dataManifestBinary); + + byte[] deleteManifestBinary = EMPTY_BINARY; + if (deltaManifests.deleteManifest() != null) { + deleteManifestBinary = ManifestFiles.encode(deltaManifests.deleteManifest()); + } + + out.writeInt(deleteManifestBinary.length); + out.write(deleteManifestBinary); + + CharSequence[] referencedDataFiles = deltaManifests.referencedDataFiles(); + out.writeInt(referencedDataFiles.length); + for (CharSequence referencedDataFile : referencedDataFiles) { + out.writeUTF(referencedDataFile.toString()); + } + + return binaryOut.toByteArray(); + } + + @Override + public DeltaManifests deserialize(int version, byte[] serialized) throws IOException { + if (version == VERSION_1) { + return deserializeV1(serialized); + } else if (version == VERSION_2) { + return deserializeV2(serialized); + } else { + throw new RuntimeException("Unknown serialize version: " + version); + } + } + + private DeltaManifests deserializeV1(byte[] serialized) throws IOException { + return new DeltaManifests(ManifestFiles.decode(serialized), null); + } + + private DeltaManifests deserializeV2(byte[] serialized) throws IOException { + ManifestFile dataManifest = null; + ManifestFile deleteManifest = null; + + ByteArrayInputStream binaryIn = new ByteArrayInputStream(serialized); + DataInputStream in = new DataInputStream(binaryIn); + + int dataManifestSize = in.readInt(); + if (dataManifestSize > 0) { + byte[] dataManifestBinary = new byte[dataManifestSize]; + Preconditions.checkState(in.read(dataManifestBinary) == dataManifestSize); + + dataManifest = ManifestFiles.decode(dataManifestBinary); + } + + int deleteManifestSize = in.readInt(); + if (deleteManifestSize > 0) { + byte[] 
deleteManifestBinary = new byte[deleteManifestSize]; + Preconditions.checkState(in.read(deleteManifestBinary) == deleteManifestSize); + + deleteManifest = ManifestFiles.decode(deleteManifestBinary); + } + + int referenceDataFileNum = in.readInt(); + CharSequence[] referencedDataFiles = new CharSequence[referenceDataFileNum]; + for (int i = 0; i < referenceDataFileNum; i++) { + referencedDataFiles[i] = in.readUTF(); + } + + return new DeltaManifests(dataManifest, deleteManifest, referencedDataFiles); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java new file mode 100644 index 000000000000..92e47792c13b --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.Set; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.StructLikeWrapper; +import org.apache.iceberg.util.StructProjection; + +/** + * Create a {@link KeySelector} to shuffle by equality fields, to ensure same equality fields record + * will be emitted to same writer in order. + */ +@Internal +public class EqualityFieldKeySelector implements KeySelector { + + private final Schema schema; + private final RowType flinkSchema; + private final Schema deleteSchema; + + private transient RowDataWrapper rowDataWrapper; + private transient StructProjection structProjection; + private transient StructLikeWrapper structLikeWrapper; + + public EqualityFieldKeySelector( + Schema schema, RowType flinkSchema, Set equalityFieldIds) { + this.schema = schema; + this.flinkSchema = flinkSchema; + this.deleteSchema = TypeUtil.select(schema, equalityFieldIds); + } + + /** + * Construct the {@link RowDataWrapper} lazily here because few members in it are not + * serializable. In this way, we don't have to serialize them with forcing. + */ + protected RowDataWrapper lazyRowDataWrapper() { + if (rowDataWrapper == null) { + rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + } + return rowDataWrapper; + } + + /** Construct the {@link StructProjection} lazily because it is not serializable. 
*/ + protected StructProjection lazyStructProjection() { + if (structProjection == null) { + structProjection = StructProjection.create(schema, deleteSchema); + } + return structProjection; + } + + /** Construct the {@link StructLikeWrapper} lazily because it is not serializable. */ + protected StructLikeWrapper lazyStructLikeWrapper() { + if (structLikeWrapper == null) { + structLikeWrapper = StructLikeWrapper.forType(deleteSchema.asStruct()); + } + return structLikeWrapper; + } + + @Override + public Integer getKey(RowData row) { + RowDataWrapper wrappedRowData = lazyRowDataWrapper().wrap(row); + StructProjection projectedRowData = lazyStructProjection().wrap(wrappedRowData); + StructLikeWrapper wrapper = lazyStructLikeWrapper().set(projectedRowData); + return wrapper.hashCode(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java new file mode 100644 index 000000000000..b6f1392d1562 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
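A hedged sketch of using the EqualityFieldKeySelector above to shuffle a RowData stream by equality key; `table`, `rowDataStream`, and the field ids are assumptions:

    Set<Integer> equalityFieldIds = Set.of(1, 2); // illustrative field ids of the table's key columns
    RowType flinkRowType = FlinkSchemaUtil.convert(table.schema());
    KeyedStream<RowData, Integer> keyedByEqualityFields =
        rowDataStream.keyBy(
            new EqualityFieldKeySelector(table.schema(), flinkRowType, equalityFieldIds));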
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.io.Serializable; +import java.io.UncheckedIOException; +import java.util.Map; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.data.FlinkAvroWriter; +import org.apache.iceberg.flink.data.FlinkOrcWriter; +import org.apache.iceberg.flink.data.FlinkParquetWriters; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.DeleteSchemaUtil; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +public class FlinkAppenderFactory implements FileAppenderFactory, Serializable { + private final Schema schema; + private final RowType flinkSchema; + private final Map props; + private final PartitionSpec spec; + private final int[] equalityFieldIds; + private final Schema eqDeleteRowSchema; + private final Schema posDeleteRowSchema; + private final Table table; + + private RowType eqDeleteFlinkSchema = null; + private RowType posDeleteFlinkSchema = null; + + public FlinkAppenderFactory( + Table table, + Schema schema, + RowType flinkSchema, + Map props, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { + Preconditions.checkNotNull(table, "Table shouldn't be null"); + this.table = table; + this.schema = schema; + this.flinkSchema = flinkSchema; + this.props = props; + this.spec = spec; + this.equalityFieldIds = equalityFieldIds; + this.eqDeleteRowSchema = eqDeleteRowSchema; + this.posDeleteRowSchema = posDeleteRowSchema; + } + + private RowType lazyEqDeleteFlinkSchema() { + if (eqDeleteFlinkSchema == null) { + Preconditions.checkNotNull(eqDeleteRowSchema, "Equality delete row schema shouldn't be null"); + this.eqDeleteFlinkSchema = FlinkSchemaUtil.convert(eqDeleteRowSchema); + } + return eqDeleteFlinkSchema; + } + + private RowType lazyPosDeleteFlinkSchema() { + if (posDeleteFlinkSchema == null) { + Preconditions.checkNotNull(posDeleteRowSchema, "Pos-delete row schema shouldn't be null"); + this.posDeleteFlinkSchema = FlinkSchemaUtil.convert(posDeleteRowSchema); + } + return this.posDeleteFlinkSchema; + } + + @Override + public FileAppender newAppender(OutputFile outputFile, FileFormat format) { + MetricsConfig metricsConfig = MetricsConfig.forTable(table); + try { + switch (format) { + case AVRO: + return Avro.write(outputFile) + .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) + .setAll(props) + .schema(schema) + .metricsConfig(metricsConfig) + .overwrite() + .build(); + + case ORC: + return ORC.write(outputFile) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) + .setAll(props) + .metricsConfig(metricsConfig) + .schema(schema) + 
.overwrite() + .build(); + + case PARQUET: + return Parquet.write(outputFile) + .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(flinkSchema, msgType)) + .setAll(props) + .metricsConfig(metricsConfig) + .schema(schema) + .overwrite() + .build(); + + default: + throw new UnsupportedOperationException("Cannot write unknown file format: " + format); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + return new DataWriter<>( + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); + } + + @Override + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, + "Equality field ids shouldn't be null or empty when creating equality-delete writer"); + Preconditions.checkNotNull( + eqDeleteRowSchema, + "Equality delete row schema shouldn't be null when creating equality-delete writer"); + + MetricsConfig metricsConfig = MetricsConfig.forTable(table); + try { + switch (format) { + case AVRO: + return Avro.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc(ignore -> new FlinkAvroWriter(lazyEqDeleteFlinkSchema())) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(eqDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .equalityFieldIds(equalityFieldIds) + .buildEqualityWriter(); + + case ORC: + return ORC.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(eqDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .equalityFieldIds(equalityFieldIds) + .buildEqualityWriter(); + + case PARQUET: + return Parquet.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(eqDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .equalityFieldIds(equalityFieldIds) + .buildEqualityWriter(); + + default: + throw new UnsupportedOperationException( + "Cannot write equality-deletes for unsupported file format: " + format); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { + MetricsConfig metricsConfig = MetricsConfig.forPositionDelete(table); + try { + switch (format) { + case AVRO: + return Avro.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc(ignore -> new FlinkAvroWriter(lazyPosDeleteFlinkSchema())) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(posDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .buildPositionWriter(); + + case ORC: + RowType orcPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + return ORC.writeDeletes(outputFile.encryptingOutputFile()) + 
.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, iSchema)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(posDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .transformPaths(path -> StringData.fromString(path.toString())) + .buildPositionWriter(); + + case PARQUET: + RowType flinkPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + return Parquet.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(posDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .transformPaths(path -> StringData.fromString(path.toString())) + .buildPositionWriter(); + + default: + throw new UnsupportedOperationException( + "Cannot write pos-deletes for unsupported file format: " + format); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java new file mode 100644 index 000000000000..2183fe062af4 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java @@ -0,0 +1,293 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
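A minimal sketch that uses the FlinkAppenderFactory above to append rows to a single Parquet data file; `table`, `outputFile`, and `row` are assumptions, and the delete-related arguments are left null because only a data appender is built:

    RowType flinkSchema = FlinkSchemaUtil.convert(table.schema());
    FlinkAppenderFactory appenderFactory =
        new FlinkAppenderFactory(
            table, table.schema(), flinkSchema, table.properties(), table.spec(), null, null, null);
    try (FileAppender<RowData> appender = appenderFactory.newAppender(outputFile, FileFormat.PARQUET)) {
      appender.add(row); // row is a RowData matching the table schema
    }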
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; + +import java.io.Serializable; +import java.util.Map; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.Table; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.data.BaseFileWriterFactory; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.data.FlinkAvroWriter; +import org.apache.iceberg.flink.data.FlinkOrcWriter; +import org.apache.iceberg.flink.data.FlinkParquetWriters; +import org.apache.iceberg.io.DeleteSchemaUtil; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class FlinkFileWriterFactory extends BaseFileWriterFactory implements Serializable { + private RowType dataFlinkType; + private RowType equalityDeleteFlinkType; + private RowType positionDeleteFlinkType; + + FlinkFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Schema dataSchema, + RowType dataFlinkType, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + RowType equalityDeleteFlinkType, + SortOrder equalityDeleteSortOrder, + Schema positionDeleteRowSchema, + RowType positionDeleteFlinkType) { + + super( + table, + dataFileFormat, + dataSchema, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSortOrder, + positionDeleteRowSchema); + + this.dataFlinkType = dataFlinkType; + this.equalityDeleteFlinkType = equalityDeleteFlinkType; + this.positionDeleteFlinkType = positionDeleteFlinkType; + } + + static Builder builderFor(Table table) { + return new Builder(table); + } + + @Override + protected void configureDataWrite(Avro.DataWriteBuilder builder) { + builder.createWriterFunc(ignore -> new FlinkAvroWriter(dataFlinkType())); + } + + @Override + protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { + builder.createWriterFunc(ignored -> new FlinkAvroWriter(equalityDeleteFlinkType())); + } + + @Override + protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { + int rowFieldIndex = positionDeleteFlinkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME); + if (rowFieldIndex >= 0) { + // FlinkAvroWriter accepts just the Flink type of the row ignoring the path and pos + RowType positionDeleteRowFlinkType = + (RowType) positionDeleteFlinkType().getTypeAt(rowFieldIndex); + builder.createWriterFunc(ignored -> new FlinkAvroWriter(positionDeleteRowFlinkType)); + } + } + + @Override + protected void configureDataWrite(Parquet.DataWriteBuilder builder) { + builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(dataFlinkType(), msgType)); + } + + @Override + protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { + builder.createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); + } + + @Override + protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) 
{ + builder.createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(positionDeleteFlinkType(), msgType)); + builder.transformPaths(path -> StringData.fromString(path.toString())); + } + + @Override + protected void configureDataWrite(ORC.DataWriteBuilder builder) { + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); + } + + @Override + protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); + } + + @Override + protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema)); + builder.transformPaths(path -> StringData.fromString(path.toString())); + } + + private RowType dataFlinkType() { + if (dataFlinkType == null) { + Preconditions.checkNotNull(dataSchema(), "Data schema must not be null"); + this.dataFlinkType = FlinkSchemaUtil.convert(dataSchema()); + } + + return dataFlinkType; + } + + private RowType equalityDeleteFlinkType() { + if (equalityDeleteFlinkType == null) { + Preconditions.checkNotNull( + equalityDeleteRowSchema(), "Equality delete schema must not be null"); + this.equalityDeleteFlinkType = FlinkSchemaUtil.convert(equalityDeleteRowSchema()); + } + + return equalityDeleteFlinkType; + } + + private RowType positionDeleteFlinkType() { + if (positionDeleteFlinkType == null) { + // wrap the optional row schema into the position delete schema that contains path and + // position + Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); + this.positionDeleteFlinkType = FlinkSchemaUtil.convert(positionDeleteSchema); + } + + return positionDeleteFlinkType; + } + + static class Builder { + private final Table table; + private FileFormat dataFileFormat; + private Schema dataSchema; + private RowType dataFlinkType; + private SortOrder dataSortOrder; + private FileFormat deleteFileFormat; + private int[] equalityFieldIds; + private Schema equalityDeleteRowSchema; + private RowType equalityDeleteFlinkType; + private SortOrder equalityDeleteSortOrder; + private Schema positionDeleteRowSchema; + private RowType positionDeleteFlinkType; + + Builder(Table table) { + this.table = table; + + Map properties = table.properties(); + + String dataFileFormatName = + properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + this.dataFileFormat = FileFormat.fromString(dataFileFormatName); + + String deleteFileFormatName = + properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); + this.deleteFileFormat = FileFormat.fromString(deleteFileFormatName); + } + + Builder dataFileFormat(FileFormat newDataFileFormat) { + this.dataFileFormat = newDataFileFormat; + return this; + } + + Builder dataSchema(Schema newDataSchema) { + this.dataSchema = newDataSchema; + return this; + } + + /** + * Sets a Flink type for data. + * + *

If not set, the value is derived from the provided Iceberg schema. + */ + Builder dataFlinkType(RowType newDataFlinkType) { + this.dataFlinkType = newDataFlinkType; + return this; + } + + Builder dataSortOrder(SortOrder newDataSortOrder) { + this.dataSortOrder = newDataSortOrder; + return this; + } + + Builder deleteFileFormat(FileFormat newDeleteFileFormat) { + this.deleteFileFormat = newDeleteFileFormat; + return this; + } + + Builder equalityFieldIds(int[] newEqualityFieldIds) { + this.equalityFieldIds = newEqualityFieldIds; + return this; + } + + Builder equalityDeleteRowSchema(Schema newEqualityDeleteRowSchema) { + this.equalityDeleteRowSchema = newEqualityDeleteRowSchema; + return this; + } + + /** + * Sets a Flink type for equality deletes. + * + *

If not set, the value is derived from the provided Iceberg schema. + */ + Builder equalityDeleteFlinkType(RowType newEqualityDeleteFlinkType) { + this.equalityDeleteFlinkType = newEqualityDeleteFlinkType; + return this; + } + + Builder equalityDeleteSortOrder(SortOrder newEqualityDeleteSortOrder) { + this.equalityDeleteSortOrder = newEqualityDeleteSortOrder; + return this; + } + + Builder positionDeleteRowSchema(Schema newPositionDeleteRowSchema) { + this.positionDeleteRowSchema = newPositionDeleteRowSchema; + return this; + } + + /** + * Sets a Flink type for position deletes. + * + *

If not set, the value is derived from the provided Iceberg schema. + */ + Builder positionDeleteFlinkType(RowType newPositionDeleteFlinkType) { + this.positionDeleteFlinkType = newPositionDeleteFlinkType; + return this; + } + + FlinkFileWriterFactory build() { + boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; + boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; + Preconditions.checkArgument( + noEqualityDeleteConf || fullEqualityDeleteConf, + "Equality field IDs and equality delete row schema must be set together"); + + return new FlinkFileWriterFactory( + table, + dataFileFormat, + dataSchema, + dataFlinkType, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteFlinkType, + equalityDeleteSortOrder, + positionDeleteRowSchema, + positionDeleteFlinkType); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java new file mode 100644 index 000000000000..13affd8484aa --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
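A hedged sketch of configuring the FlinkFileWriterFactory builder above for an upsert-style write (the factory is package-private, so this would sit in the sink package); the `table` and the "id" identifier column are assumptions:

    Schema schema = table.schema();
    int idFieldId = schema.findField("id").fieldId(); // assumes an "id" column used as the key
    FlinkFileWriterFactory writerFactory =
        FlinkFileWriterFactory.builderFor(table)
            .dataSchema(schema)
            .dataFileFormat(FileFormat.PARQUET)
            .deleteFileFormat(FileFormat.PARQUET)
            .equalityFieldIds(new int[] {idFieldId})
            .equalityDeleteRowSchema(TypeUtil.select(schema, Set.of(idFieldId)))
            .build();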
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.function.Supplier; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestWriter; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class FlinkManifestUtil { + + private static final Logger LOG = LoggerFactory.getLogger(FlinkManifestUtil.class); + private static final int FORMAT_V2 = 2; + private static final Long DUMMY_SNAPSHOT_ID = 0L; + + private FlinkManifestUtil() {} + + static ManifestFile writeDataFiles( + OutputFile outputFile, PartitionSpec spec, List dataFiles) throws IOException { + ManifestWriter writer = + ManifestFiles.write(FORMAT_V2, spec, outputFile, DUMMY_SNAPSHOT_ID); + + try (ManifestWriter closeableWriter = writer) { + closeableWriter.addAll(dataFiles); + } + + return writer.toManifestFile(); + } + + static List readDataFiles( + ManifestFile manifestFile, FileIO io, Map specsById) + throws IOException { + try (CloseableIterable dataFiles = ManifestFiles.read(manifestFile, io, specsById)) { + return Lists.newArrayList(dataFiles); + } + } + + public static ManifestOutputFileFactory createOutputFileFactory( + Supplier

tableSupplier, + Map tableProps, + String flinkJobId, + String operatorUniqueId, + int subTaskId, + long attemptNumber) { + return new ManifestOutputFileFactory( + tableSupplier, tableProps, flinkJobId, operatorUniqueId, subTaskId, attemptNumber); + } + + /** + * Write the {@link WriteResult} to temporary manifest files. + * + * @param result all those DataFiles/DeleteFiles in this WriteResult should be written with same + * partition spec + */ + public static DeltaManifests writeCompletedFiles( + WriteResult result, Supplier outputFileSupplier, PartitionSpec spec) + throws IOException { + + ManifestFile dataManifest = null; + ManifestFile deleteManifest = null; + + // Write the completed data files into a newly created data manifest file. + if (result.dataFiles() != null && result.dataFiles().length > 0) { + dataManifest = + writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles())); + } + + // Write the completed delete files into a newly created delete manifest file. + if (result.deleteFiles() != null && result.deleteFiles().length > 0) { + OutputFile deleteManifestFile = outputFileSupplier.get(); + + ManifestWriter deleteManifestWriter = + ManifestFiles.writeDeleteManifest(FORMAT_V2, spec, deleteManifestFile, DUMMY_SNAPSHOT_ID); + try (ManifestWriter writer = deleteManifestWriter) { + for (DeleteFile deleteFile : result.deleteFiles()) { + writer.add(deleteFile); + } + } + + deleteManifest = deleteManifestWriter.toManifestFile(); + } + + return new DeltaManifests(dataManifest, deleteManifest, result.referencedDataFiles()); + } + + public static WriteResult readCompletedFiles( + DeltaManifests deltaManifests, FileIO io, Map specsById) + throws IOException { + WriteResult.Builder builder = WriteResult.builder(); + + // Read the completed data files from persisted data manifest file. + if (deltaManifests.dataManifest() != null) { + builder.addDataFiles(readDataFiles(deltaManifests.dataManifest(), io, specsById)); + } + + // Read the completed delete files from persisted delete manifests file. + if (deltaManifests.deleteManifest() != null) { + try (CloseableIterable deleteFiles = + ManifestFiles.readDeleteManifest(deltaManifests.deleteManifest(), io, specsById)) { + builder.addDeleteFiles(deleteFiles); + } + } + + return builder.addReferencedDataFiles(deltaManifests.referencedDataFiles()).build(); + } + + public static void deleteCommittedManifests( + Table table, List manifests, String newFlinkJobId, long checkpointId) { + deleteCommittedManifests(table.name(), table.io(), manifests, newFlinkJobId, checkpointId); + } + + static void deleteCommittedManifests( + String tableName, + FileIO io, + List manifestsPath, + String newFlinkJobId, + long checkpointId) { + for (ManifestFile manifest : manifestsPath) { + try { + io.deleteFile(manifest.path()); + } catch (Exception e) { + // The flink manifests cleaning failure shouldn't abort the completed checkpoint. 
+ String details = + MoreObjects.toStringHelper(FlinkManifestUtil.class) + .add("tableName", tableName) + .add("flinkJobId", newFlinkJobId) + .add("checkpointId", checkpointId) + .add("manifestPath", manifest) + .toString(); + LOG.warn( + "The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}", + details, + e); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java new file mode 100644 index 000000000000..d83a11d0f462 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java @@ -0,0 +1,783 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Function; +import org.apache.flink.api.common.functions.FlatMapFunction; +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.api.functions.sink.v2.DiscardingSink; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Partitioning; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.TableLoader; +import 
org.apache.iceberg.flink.sink.shuffle.DataStatisticsOperatorFactory; +import org.apache.iceberg.flink.sink.shuffle.RangePartitioner; +import org.apache.iceberg.flink.sink.shuffle.StatisticsOrRecord; +import org.apache.iceberg.flink.sink.shuffle.StatisticsOrRecordTypeInformation; +import org.apache.iceberg.flink.sink.shuffle.StatisticsType; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.SerializableSupplier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class FlinkSink { + private static final Logger LOG = LoggerFactory.getLogger(FlinkSink.class); + + private static final String ICEBERG_STREAM_WRITER_NAME = + IcebergStreamWriter.class.getSimpleName(); + private static final String ICEBERG_FILES_COMMITTER_NAME = + IcebergFilesCommitter.class.getSimpleName(); + + private FlinkSink() {} + + /** + * Initialize a {@link Builder} to export the data from generic input data stream into iceberg + * table. We use {@link RowData} inside the sink connector, so users need to provide a mapper + * function and a {@link TypeInformation} to convert those generic records to a RowData + * DataStream. + * + * @param input the generic source input data stream. + * @param mapper function to convert the generic data to {@link RowData} + * @param outputType to define the {@link TypeInformation} for the input data. + * @param the data type of records. + * @return {@link Builder} to connect the iceberg table. + */ + public static Builder builderFor( + DataStream input, MapFunction mapper, TypeInformation outputType) { + return new Builder().forMapperOutputType(input, mapper, outputType); + } + + /** + * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into + * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a + * {@link TableSchema} for builder to convert those {@link Row}s to a {@link RowData} DataStream. + * + * @param input the source input data stream with {@link Row}s. + * @param tableSchema defines the {@link TypeInformation} for input data. + * @return {@link Builder} to connect the iceberg table. + * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #forRow(DataStream, + * ResolvedSchema)} instead. + */ + @Deprecated + public static Builder forRow(DataStream input, TableSchema tableSchema) { + RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); + DataType[] fieldDataTypes = tableSchema.getFieldDataTypes(); + + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(fieldDataTypes); + return builderFor(input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) + .tableSchema(tableSchema); + } + + /** + * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into + * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a + * {@link ResolvedSchema} for builder to convert those {@link Row}s to a {@link RowData} + * DataStream. + * + * @param input the source input data stream with {@link Row}s. 
+ * @param resolvedSchema defines the {@link TypeInformation} for input data. + * @return {@link Builder} to connect the iceberg table. + */ + public static Builder forRow(DataStream input, ResolvedSchema resolvedSchema) { + RowType rowType = (RowType) resolvedSchema.toSinkRowDataType().getLogicalType(); + DataType[] fieldDataTypes = resolvedSchema.getColumnDataTypes().toArray(DataType[]::new); + + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(fieldDataTypes); + return builderFor(input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) + .resolvedSchema(resolvedSchema); + } + + /** + * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s + * into iceberg table. + * + * @param input the source input data stream with {@link RowData}s. + * @return {@link Builder} to connect the iceberg table. + */ + public static Builder forRowData(DataStream input) { + return new Builder().forRowData(input); + } + + public static class Builder implements IcebergSinkBuilder { + private Function> inputCreator = null; + private TableLoader tableLoader; + private Table table; + @Deprecated private TableSchema tableSchema; + private ResolvedSchema resolvedSchema; + private List equalityFieldColumns = null; + private String uidPrefix = null; + private final Map snapshotProperties = Maps.newHashMap(); + private ReadableConfig readableConfig = new Configuration(); + private final Map writeOptions = Maps.newHashMap(); + private FlinkWriteConf flinkWriteConf = null; + + private Builder() {} + + private Builder forRowData(DataStream newRowDataInput) { + this.inputCreator = ignored -> newRowDataInput; + return this; + } + + private Builder forMapperOutputType( + DataStream input, MapFunction mapper, TypeInformation outputType) { + this.inputCreator = + newUidPrefix -> { + // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we + // need to set the parallelism + // of map operator same as its input to keep map operator chaining its input, and avoid + // rebalanced by default. + SingleOutputStreamOperator inputStream = + input.map(mapper, outputType).setParallelism(input.getParallelism()); + if (newUidPrefix != null) { + inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper"); + } + return inputStream; + }; + return this; + } + + /** + * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter} + * which will write all the records into {@link DataFile}s and emit them to downstream operator. + * Providing a table would avoid so many table loading from each separate task. + * + * @param newTable the loaded iceberg table instance. + * @return {@link Builder} to connect the iceberg table. + */ + @Override + public Builder table(Table newTable) { + this.table = newTable; + return this; + } + + /** + * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need + * this loader because {@link Table} is not serializable and could not just use the loaded table + * from Builder#table in the remote task manager. + * + * @param newTableLoader to load iceberg table inside tasks. + * @return {@link Builder} to connect the iceberg table. + */ + @Override + public Builder tableLoader(TableLoader newTableLoader) { + this.tableLoader = newTableLoader; + return this; + } + + /** + * Set the write properties for Flink sink. 
View the supported properties in {@link + * FlinkWriteOptions} + */ + public Builder set(String property, String value) { + writeOptions.put(property, value); + return this; + } + + /** + * Set the write properties for Flink sink. View the supported properties in {@link + * FlinkWriteOptions} + */ + @Override + public Builder setAll(Map properties) { + writeOptions.putAll(properties); + return this; + } + + @Override + public Builder tableSchema(TableSchema newTableSchema) { + this.tableSchema = newTableSchema; + return this; + } + + @Override + public Builder resolvedSchema(ResolvedSchema newResolvedSchema) { + this.resolvedSchema = newResolvedSchema; + return this; + } + + @Override + public Builder overwrite(boolean newOverwrite) { + writeOptions.put(FlinkWriteOptions.OVERWRITE_MODE.key(), Boolean.toString(newOverwrite)); + return this; + } + + @Override + public Builder flinkConf(ReadableConfig config) { + this.readableConfig = config; + return this; + } + + /** + * Configure the write {@link DistributionMode} that the flink sink will use. Currently, flink + * support {@link DistributionMode#NONE} and {@link DistributionMode#HASH}. + * + * @param mode to specify the write distribution mode. + * @return {@link Builder} to connect the iceberg table. + */ + @Override + public Builder distributionMode(DistributionMode mode) { + if (mode != null) { + writeOptions.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), mode.modeName()); + } + return this; + } + + /** + * Range distribution needs to collect statistics about data distribution to properly shuffle + * the records in relatively balanced way. In general, low cardinality should use {@link + * StatisticsType#Map} and high cardinality should use {@link StatisticsType#Sketch} Refer to + * {@link StatisticsType} Javadoc for more details. + * + *

Default is {@link StatisticsType#Auto} where initially Map statistics is used. But if + * cardinality is higher than the threshold (currently 10K) as defined in {@code + * SketchUtil#OPERATOR_SKETCH_SWITCH_THRESHOLD}, statistics collection automatically switches to + * the sketch reservoir sampling. + * + *

Explicitly set the statistics type if the default behavior doesn't work. + * + * @param type to specify the statistics type for range distribution. + * @return {@link Builder} to connect the iceberg table. + */ + public Builder rangeDistributionStatisticsType(StatisticsType type) { + if (type != null) { + writeOptions.put(FlinkWriteOptions.RANGE_DISTRIBUTION_STATISTICS_TYPE.key(), type.name()); + } + return this; + } + + /** + * If sort order contains partition columns, each sort key would map to one partition and data + * file. This relative weight can avoid placing too many small files for sort keys with low + * traffic. It is a double value that defines the minimal weight for each sort key. `0.02` means + * each key has a base weight of `2%` of the targeted traffic weight per writer task. + * + *

E.g. the sink Iceberg table is partitioned daily by event time. Assume the data stream + * contains events from now up to 180 days ago. With event time, traffic weight distribution + * across different days typically has a long tail pattern. Current day contains the most + * traffic. The older days (long tail) contain less and less traffic. Assume writer parallelism + * is `10`. The total weight across all 180 days is `10,000`. Target traffic weight per writer + * task would be `1,000`. Assume the weight sum for the oldest 150 days is `1,000`. Normally, + * the range partitioner would put all the oldest 150 days in one writer task. That writer task + * would write to 150 small files (one per day). Keeping 150 open files can potentially consume + * large amount of memory. Flushing and uploading 150 files (however small) at checkpoint time + * can also be potentially slow. If this config is set to `0.02`. It means every sort key has a + * base weight of `2%` of targeted weight of `1,000` for every write task. It would essentially + * avoid placing more than `50` data files (one per day) on one writer task no matter how small + * they are. + * + *

This is only applicable to {@link StatisticsType#Map} for low-cardinality scenario. For + * {@link StatisticsType#Sketch} high-cardinality sort columns, they are usually not used as + * partition columns. Otherwise, too many partitions and small files may be generated during + * write. Sketch range partitioner simply splits high-cardinality keys into ordered ranges. + * + *

Default is {@code 0.0%}. + */ + public Builder rangeDistributionSortKeyBaseWeight(double weight) { + writeOptions.put( + FlinkWriteOptions.RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT.key(), Double.toString(weight)); + return this; + } + + /** + * Configures the write parallelism for the iceberg stream writer. + * + * @param newWriteParallelism the number of parallel iceberg stream writers. + * @return {@link Builder} to connect the iceberg table. + */ + @Override + public Builder writeParallelism(int newWriteParallelism) { + writeOptions.put( + FlinkWriteOptions.WRITE_PARALLELISM.key(), Integer.toString(newWriteParallelism)); + return this; + } + + /** + * All INSERT/UPDATE_AFTER events from the input stream will be transformed to UPSERT events, + * which means the old record is DELETEd and then the new record is INSERTed. In a partitioned + * table, the partition fields should be a subset of the equality fields; otherwise an old row + * located in partition-A could not be deleted by a new row located in partition-B. + * + * @param enabled indicates whether to transform all INSERT/UPDATE_AFTER events to UPSERT. + * @return {@link Builder} to connect the iceberg table. + */ + @Override + public Builder upsert(boolean enabled) { + writeOptions.put(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key(), Boolean.toString(enabled)); + return this; + } + + /** + * Configures the equality field columns for an iceberg table that accepts CDC or UPSERT events. + * + * @param columns defines the iceberg table's key. + * @return {@link Builder} to connect the iceberg table. + */ + @Override + public Builder equalityFieldColumns(List<String> columns) { + this.equalityFieldColumns = columns; + return this; + } + + /** + * Set the uid prefix for FlinkSink operators. Note that FlinkSink internally consists of + * multiple operators (like writer, committer, dummy sink etc.). The actual operator uid will be + * this prefix appended with a suffix like "uidPrefix-writer".
+ *
+ * If provided, this prefix is also applied to operator names.
+ *
+ * Flink auto-generates an operator uid if one is not set explicitly. It is a recommended + * best practice to set uids for all operators before deploying to production. Flink offers the + * option {@code pipeline.auto-generate-uid=false} to disable auto-generation and force + * explicit setting of all operator uids.
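A minimal sketch of that environment-level switch (the exact option key should be verified against the Flink version in use; PipelineOptions.AUTO_GENERATE_UIDS is assumed here to correspond to the option referenced above):

// Force explicit uids so that missing uids surface as errors at job translation time
// instead of silently changing between deployments.
Configuration conf = new Configuration();            // org.apache.flink.configuration.Configuration
conf.set(PipelineOptions.AUTO_GENERATE_UIDS, false); // org.apache.flink.configuration.PipelineOptions
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(conf);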
+ *
+ * Be careful with setting this for an existing job, because now we are changing the operator + * uid from an auto-generated one to this new value. When deploying the change with a + * checkpoint, Flink won't be able to restore the previous Flink sink operator state (more + * specifically the committer operator state). You need to use {@code --allowNonRestoredState} + * to ignore the previous sink state. During restore Flink sink state is used to check if last + * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss + * if the Iceberg commit failed in the last completed checkpoint. + * + * @param newPrefix prefix for Flink sink operator uid and name + * @return {@link Builder} to connect the iceberg table. + */ + public Builder uidPrefix(String newPrefix) { + this.uidPrefix = newPrefix; + return this; + } + + public Builder setSnapshotProperties(Map properties) { + snapshotProperties.putAll(properties); + return this; + } + + public Builder setSnapshotProperty(String property, String value) { + snapshotProperties.put(property, value); + return this; + } + + @Override + public Builder toBranch(String branch) { + writeOptions.put(FlinkWriteOptions.BRANCH.key(), branch); + return this; + } + + private DataStreamSink chainIcebergOperators() { + Preconditions.checkArgument( + inputCreator != null, + "Please use forRowData() or forMapperOutputType() to initialize the input DataStream."); + Preconditions.checkNotNull(tableLoader, "Table loader shouldn't be null"); + + DataStream rowDataInput = inputCreator.apply(uidPrefix); + + if (table == null) { + if (!tableLoader.isOpen()) { + tableLoader.open(); + } + + try (TableLoader loader = tableLoader) { + this.table = loader.loadTable(); + } catch (IOException e) { + throw new UncheckedIOException( + "Failed to load iceberg table from table loader: " + tableLoader, e); + } + } + + flinkWriteConf = new FlinkWriteConf(table, writeOptions, readableConfig); + + // Find out the equality field id list based on the user-provided equality field column names. + Set equalityFieldIds = + SinkUtil.checkAndGetEqualityFieldIds(table, equalityFieldColumns); + + RowType flinkRowType = + resolvedSchema != null + ? toFlinkRowType(table.schema(), resolvedSchema) + : toFlinkRowType(table.schema(), tableSchema); + int writerParallelism = + flinkWriteConf.writeParallelism() == null + ? rowDataInput.getParallelism() + : flinkWriteConf.writeParallelism(); + + // Distribute the records from input data stream based on the write.distribution-mode and + // equality fields. + DataStream distributeStream = + distributeDataStream(rowDataInput, equalityFieldIds, flinkRowType, writerParallelism); + + // Add parallel writers that append rows to files + SingleOutputStreamOperator writerStream = + appendWriter(distributeStream, flinkRowType, equalityFieldIds, writerParallelism); + + // Add single-parallelism committer that commits files + // after successful checkpoint or end of input + SingleOutputStreamOperator committerStream = appendCommitter(writerStream); + + // Add dummy discard sink + return appendDummySink(committerStream); + } + + /** + * Append the iceberg sink operators to write records to iceberg table. + * + * @return {@link DataStreamSink} for sink. + */ + @Override + public DataStreamSink append() { + return chainIcebergOperators(); + } + + private String operatorName(String suffix) { + return uidPrefix != null ? 
uidPrefix + "-" + suffix : suffix; + } + + @VisibleForTesting + List checkAndGetEqualityFieldIds() { + List equalityFieldIds = Lists.newArrayList(table.schema().identifierFieldIds()); + if (equalityFieldColumns != null && !equalityFieldColumns.isEmpty()) { + Set equalityFieldSet = + Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); + for (String column : equalityFieldColumns) { + org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column); + Preconditions.checkNotNull( + field, + "Missing required equality field column '%s' in table schema %s", + column, + table.schema()); + equalityFieldSet.add(field.fieldId()); + } + + if (!equalityFieldSet.equals(table.schema().identifierFieldIds())) { + LOG.warn( + "The configured equality field column IDs {} are not matched with the schema identifier field IDs" + + " {}, use job specified equality field columns as the equality fields by default.", + equalityFieldSet, + table.schema().identifierFieldIds()); + } + equalityFieldIds = Lists.newArrayList(equalityFieldSet); + } + return equalityFieldIds; + } + + private DataStreamSink appendDummySink(SingleOutputStreamOperator committerStream) { + DataStreamSink resultStream = + committerStream + .sinkTo(new DiscardingSink<>()) + .name(operatorName(String.format("IcebergSink %s", this.table.name()))) + .setParallelism(1); + if (uidPrefix != null) { + resultStream = resultStream.uid(uidPrefix + "-dummysink"); + } + return resultStream; + } + + private SingleOutputStreamOperator appendCommitter( + SingleOutputStreamOperator writerStream) { + OneInputStreamOperatorFactory filesCommitterFactory = + new IcebergFilesCommitterFactory( + tableLoader, + flinkWriteConf.overwriteMode(), + snapshotProperties, + flinkWriteConf.workerPoolSize(), + flinkWriteConf.branch(), + table.spec()); + SingleOutputStreamOperator committerStream = + writerStream + .transform( + operatorName(ICEBERG_FILES_COMMITTER_NAME), Types.VOID, filesCommitterFactory) + .setParallelism(1) + .setMaxParallelism(1); + if (uidPrefix != null) { + committerStream = committerStream.uid(uidPrefix + "-committer"); + } + return committerStream; + } + + private SingleOutputStreamOperator appendWriter( + DataStream input, + RowType flinkRowType, + Set equalityFieldIds, + int writerParallelism) { + // Validate the equality fields and partition fields if we enable the upsert mode. + if (flinkWriteConf.upsertMode()) { + Preconditions.checkState( + !flinkWriteConf.overwriteMode(), + "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); + Preconditions.checkState( + !equalityFieldIds.isEmpty(), + "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); + if (!table.spec().isUnpartitioned()) { + for (PartitionField partitionField : table.spec().fields()) { + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), + "In UPSERT mode, source column '%s' of partition field '%s', should be included in equality fields: '%s'", + table.schema().findColumnName(partitionField.sourceId()), + partitionField, + equalityFieldColumns); + } + } + } + + SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); + Duration tableRefreshInterval = flinkWriteConf.tableRefreshInterval(); + + SerializableSupplier
<Table>
tableSupplier; + if (tableRefreshInterval != null) { + tableSupplier = + new CachingTableSupplier(serializableTable, tableLoader, tableRefreshInterval); + } else { + tableSupplier = () -> serializableTable; + } + + IcebergStreamWriter streamWriter = + createStreamWriter(tableSupplier, flinkWriteConf, flinkRowType, equalityFieldIds); + + SingleOutputStreamOperator writerStream = + input + .transform( + operatorName(ICEBERG_STREAM_WRITER_NAME), + TypeInformation.of(FlinkWriteResult.class), + streamWriter) + .setParallelism(writerParallelism); + if (uidPrefix != null) { + writerStream = writerStream.uid(uidPrefix + "-writer"); + } + return writerStream; + } + + private DataStream distributeDataStream( + DataStream input, + Set equalityFieldIds, + RowType flinkRowType, + int writerParallelism) { + DistributionMode writeMode = flinkWriteConf.distributionMode(); + LOG.info("Write distribution mode is '{}'", writeMode.modeName()); + + Schema iSchema = table.schema(); + PartitionSpec partitionSpec = table.spec(); + SortOrder sortOrder = table.sortOrder(); + + switch (writeMode) { + case NONE: + if (equalityFieldIds.isEmpty()) { + return input; + } else { + LOG.info("Distribute rows by equality fields, because there are equality fields set"); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + } + + case HASH: + if (equalityFieldIds.isEmpty()) { + if (partitionSpec.isUnpartitioned()) { + LOG.warn( + "Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and table is unpartitioned"); + return input; + } else { + return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); + } + } else { + if (partitionSpec.isUnpartitioned()) { + LOG.info( + "Distribute rows by equality fields, because there are equality fields set " + + "and table is unpartitioned"); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + } else { + for (PartitionField partitionField : partitionSpec.fields()) { + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), + "In 'hash' distribution mode with equality fields set, source column '%s' of partition field '%s' " + + "should be included in equality fields: '%s'", + table.schema().findColumnName(partitionField.sourceId()), + partitionField, + equalityFieldColumns); + } + return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); + } + } + + case RANGE: + // Ideally, exception should be thrown in the combination of range distribution and + // equality fields. Primary key case should use hash distribution mode. + // Keep the current behavior of falling back to keyBy for backward compatibility. + if (!equalityFieldIds.isEmpty()) { + LOG.warn( + "Hash distribute rows by equality fields, even though {}=range is set. " + + "Range distribution for primary keys are not always safe in " + + "Flink streaming writer.", + WRITE_DISTRIBUTION_MODE); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + } + + // range distribute by partition key or sort key if table has an SortOrder + Preconditions.checkState( + sortOrder.isSorted() || partitionSpec.isPartitioned(), + "Invalid write distribution mode: range. 
Need to define sort order or partition spec."); + if (sortOrder.isUnsorted()) { + sortOrder = Partitioning.sortOrderFor(partitionSpec); + LOG.info("Construct sort order from partition spec"); + } + + LOG.info("Range distribute rows by sort order: {}", sortOrder); + StatisticsOrRecordTypeInformation statisticsOrRecordTypeInformation = + new StatisticsOrRecordTypeInformation(flinkRowType, iSchema, sortOrder); + StatisticsType statisticsType = flinkWriteConf.rangeDistributionStatisticsType(); + SingleOutputStreamOperator shuffleStream = + input + .transform( + operatorName("range-shuffle"), + statisticsOrRecordTypeInformation, + new DataStatisticsOperatorFactory( + iSchema, + sortOrder, + writerParallelism, + statisticsType, + flinkWriteConf.rangeDistributionSortKeyBaseWeight())) + // Set the parallelism same as input operator to encourage chaining + .setParallelism(input.getParallelism()); + if (uidPrefix != null) { + shuffleStream = shuffleStream.uid(uidPrefix + "-shuffle"); + } + + return shuffleStream + .partitionCustom(new RangePartitioner(iSchema, sortOrder), r -> r) + .flatMap( + (FlatMapFunction) + (statisticsOrRecord, out) -> { + if (statisticsOrRecord.hasRecord()) { + out.collect(statisticsOrRecord.record()); + } + }) + // Set the parallelism same as writerParallelism to + // promote operator chaining with the downstream writer operator + .setParallelism(writerParallelism) + .returns(RowData.class); + + default: + throw new RuntimeException("Unrecognized " + WRITE_DISTRIBUTION_MODE + ": " + writeMode); + } + } + } + + /** + * Clean up after removing {@link Builder#tableSchema} + * + * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #toFlinkRowType(Schema, + * ResolvedSchema)} instead. + */ + @Deprecated + static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { + if (requestedSchema != null) { + // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing + // iceberg schema. + Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); + TypeUtil.validateWriteSchema(schema, writeSchema, true, true); + + // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will + // be promoted to iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT + // (backend by 1 'byte'), we will read 4 bytes rather than 1 byte, it will mess up the + // byte array in BinaryRowData. So here we must use flink schema. + return (RowType) requestedSchema.toRowDataType().getLogicalType(); + } else { + return FlinkSchemaUtil.convert(schema); + } + } + + static RowType toFlinkRowType(Schema schema, ResolvedSchema requestedSchema) { + if (requestedSchema != null) { + // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing + // iceberg schema. + Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); + TypeUtil.validateWriteSchema(schema, writeSchema, true, true); + + // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will + // be promoted to iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT + // (backend by 1 'byte'), we will read 4 bytes rather than 1 byte, it will mess up the + // byte array in BinaryRowData. So here we must use flink schema. 
+ return (RowType) requestedSchema.toSinkRowDataType().getLogicalType(); + } else { + return FlinkSchemaUtil.convert(schema); + } + } + + static IcebergStreamWriter createStreamWriter( + SerializableSupplier
tableSupplier, + FlinkWriteConf flinkWriteConf, + RowType flinkRowType, + Set equalityFieldIds) { + Preconditions.checkArgument(tableSupplier != null, "Iceberg table supplier shouldn't be null"); + + Table initTable = tableSupplier.get(); + FileFormat format = flinkWriteConf.dataFileFormat(); + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + tableSupplier, + flinkRowType, + flinkWriteConf.targetDataFileSize(), + format, + SinkUtil.writeProperties(format, flinkWriteConf, initTable), + equalityFieldIds, + flinkWriteConf.upsertMode()); + + return new IcebergStreamWriter<>(initTable.name(), taskWriterFactory); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java new file mode 100644 index 000000000000..317fb169ae1b --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.Serializable; +import org.apache.iceberg.io.WriteResult; + +public class FlinkWriteResult implements Serializable { + private final long checkpointId; + private final WriteResult writeResult; + + public FlinkWriteResult(long checkpointId, WriteResult writeResult) { + this.checkpointId = checkpointId; + this.writeResult = writeResult; + } + + public long checkpointId() { + return checkpointId; + } + + public WriteResult writeResult() { + return writeResult; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java new file mode 100644 index 000000000000..408c3e9a9d5f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
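As a small illustration of how the FlinkWriteResult class above is meant to be used (placeholder values; the real stream writer attaches the data and delete files flushed for the current checkpoint):

// A writer-side result tagged with the checkpoint that produced it; the committer later
// groups FlinkWriteResult instances by checkpointId().
WriteResult flushed = WriteResult.builder().build();        // stands in for real flushed files
FlinkWriteResult perCheckpoint = new FlinkWriteResult(42L, flushed);
long checkpointId = perCheckpoint.checkpointId();            // 42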
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Objects; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +/** + * The aggregated results of a single checkpoint which should be committed. It contains the + * serialized {@link org.apache.iceberg.flink.sink.DeltaManifests} file - which contains the commit + * data - and the jobId, operatorId, checkpointId triplet, which helps identify the specific commit. + * + *

{@link IcebergCommittableSerializer} is used for serializing the objects between the Writer + * and the Aggregator operator and between the Aggregator and the Committer as well. + */ +class IcebergCommittable implements Serializable { + private final byte[] manifest; + private final String jobId; + private final String operatorId; + private final long checkpointId; + + IcebergCommittable(byte[] manifest, String jobId, String operatorId, long checkpointId) { + this.manifest = manifest; + this.jobId = jobId; + this.operatorId = operatorId; + this.checkpointId = checkpointId; + } + + byte[] manifest() { + return manifest; + } + + String jobId() { + return jobId; + } + + String operatorId() { + return operatorId; + } + + Long checkpointId() { + return checkpointId; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("jobId", jobId) + .add("checkpointId", checkpointId) + .add("operatorId", operatorId) + .toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + IcebergCommittable that = (IcebergCommittable) o; + return checkpointId == that.checkpointId + && Arrays.equals(manifest, that.manifest) + && Objects.equals(jobId, that.jobId) + && Objects.equals(operatorId, that.operatorId); + } + + @Override + public int hashCode() { + int result = Objects.hash(jobId, operatorId, checkpointId); + result = 31 * result + Arrays.hashCode(manifest); + return result; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java new file mode 100644 index 000000000000..1d83c211e001 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputViewStreamWrapper; + +/** + * This serializer is used for serializing the {@link IcebergCommittable} objects between the Writer + * and the Aggregator operator and between the Aggregator and the Committer as well. + * + *

In both cases only the respective part is serialized. + */ +public class IcebergCommittableSerializer implements SimpleVersionedSerializer { + private static final int VERSION = 1; + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(IcebergCommittable committable) throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + DataOutputViewStreamWrapper view = new DataOutputViewStreamWrapper(out); + view.writeUTF(committable.jobId()); + view.writeUTF(committable.operatorId()); + view.writeLong(committable.checkpointId()); + view.writeInt(committable.manifest().length); + view.write(committable.manifest()); + return out.toByteArray(); + } + + @Override + public IcebergCommittable deserialize(int version, byte[] serialized) throws IOException { + if (version == 1) { + DataInputDeserializer view = new DataInputDeserializer(serialized); + String jobId = view.readUTF(); + String operatorId = view.readUTF(); + long checkpointId = view.readLong(); + int manifestLen = view.readInt(); + byte[] manifestBuf; + manifestBuf = new byte[manifestLen]; + view.read(manifestBuf); + return new IcebergCommittable(manifestBuf, jobId, operatorId, checkpointId); + } + throw new IOException("Unrecognized version or corrupt state: " + version); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java new file mode 100644 index 000000000000..c05e7d918093 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java @@ -0,0 +1,317 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
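A round-trip sketch of the versioned serialization shown above (illustrative only; IcebergCommittable and its constructor are package-private, so this compiles only inside org.apache.iceberg.flink.sink, and the ids and manifest bytes are placeholders):

// Inside a method that declares `throws IOException`.
IcebergCommittableSerializer serializer = new IcebergCommittableSerializer();
IcebergCommittable original = new IcebergCommittable(new byte[0], "jobId", "operatorId", 7L);
byte[] bytes = serializer.serialize(original);
IcebergCommittable restored = serializer.deserialize(serializer.getVersion(), bytes);
// equals() compares the manifest bytes plus the jobId/operatorId/checkpointId triplet,
// so restored.equals(original) holds.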
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import org.apache.flink.api.connector.sink2.Committer; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ReplacePartitions; +import org.apache.iceberg.RowDelta; +import org.apache.iceberg.SnapshotUpdate; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.PropertyUtil; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class implements the Flink SinkV2 {@link Committer} interface to implement the Iceberg + * commits. The implementation builds on the following assumptions: + * + *

    + *
  • There is a single {@link IcebergCommittable} for every checkpoint + *
  • There is no late checkpoint - if checkpoint 'x' has been received in one call, then after a + * successful run only checkpoints > x will arrive + *
  • There is no other writer which would generate another commit to the same branch with the + * same jobId-operatorId-checkpointId triplet + *
+ */ +class IcebergCommitter implements Committer { + private static final Logger LOG = LoggerFactory.getLogger(IcebergCommitter.class); + private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; + public static final WriteResult EMPTY_WRITE_RESULT = + WriteResult.builder() + .addDataFiles(Lists.newArrayList()) + .addDeleteFiles(Lists.newArrayList()) + .build(); + + @VisibleForTesting + static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; + + private final String branch; + private final Map snapshotProperties; + private final boolean replacePartitions; + private IcebergFilesCommitterMetrics committerMetrics; + private Table table; + private final TableLoader tableLoader; + private int maxContinuousEmptyCommits; + private ExecutorService workerPool; + private int continuousEmptyCheckpoints = 0; + private boolean compactMode = false; + + IcebergCommitter( + TableLoader tableLoader, + String branch, + Map snapshotProperties, + boolean replacePartitions, + int workerPoolSize, + String sinkId, + IcebergFilesCommitterMetrics committerMetrics, + boolean compactMode) { + this.branch = branch; + this.snapshotProperties = snapshotProperties; + this.replacePartitions = replacePartitions; + this.committerMetrics = committerMetrics; + this.tableLoader = tableLoader; + if (!tableLoader.isOpen()) { + tableLoader.open(); + } + + this.table = tableLoader.loadTable(); + this.maxContinuousEmptyCommits = + PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); + Preconditions.checkArgument( + maxContinuousEmptyCommits > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); + this.workerPool = + ThreadPools.newFixedThreadPool( + "iceberg-committer-pool-" + table.name() + "-" + sinkId, workerPoolSize); + this.continuousEmptyCheckpoints = 0; + this.compactMode = compactMode; + } + + @Override + public void commit(Collection> commitRequests) + throws IOException, InterruptedException { + if (commitRequests.isEmpty()) { + return; + } + + NavigableMap> commitRequestMap = Maps.newTreeMap(); + for (CommitRequest request : commitRequests) { + commitRequestMap.put(request.getCommittable().checkpointId(), request); + } + + IcebergCommittable last = commitRequestMap.lastEntry().getValue().getCommittable(); + long maxCommittedCheckpointId = + SinkUtil.getMaxCommittedCheckpointId(table, last.jobId(), last.operatorId(), branch); + // Mark the already committed FilesCommittable(s) as finished + commitRequestMap + .headMap(maxCommittedCheckpointId, true) + .values() + .forEach(CommitRequest::signalAlreadyCommitted); + NavigableMap> uncommitted = + commitRequestMap.tailMap(maxCommittedCheckpointId, false); + if (!uncommitted.isEmpty()) { + commitPendingRequests(uncommitted, last.jobId(), last.operatorId()); + } + } + + /** + * Commits the data to the Iceberg table by reading the file data from the {@link + * org.apache.iceberg.flink.sink.DeltaManifests} ordered by the checkpointId, and writing the new + * snapshot to the Iceberg table. The {@link org.apache.iceberg.SnapshotSummary} will contain the + * jobId, snapshotId, checkpointId so in case of job restart we can identify which changes are + * committed, and which are still waiting for the commit. 
+ * + * @param commitRequestMap The checkpointId to {@link CommitRequest} map of the changes to commit + * @param newFlinkJobId The jobId to store in the {@link org.apache.iceberg.SnapshotSummary} + * @param operatorId The operatorId to store in the {@link org.apache.iceberg.SnapshotSummary} + * @throws IOException On commit failure + */ + private void commitPendingRequests( + NavigableMap> commitRequestMap, + String newFlinkJobId, + String operatorId) + throws IOException { + long checkpointId = commitRequestMap.lastKey(); + List manifests = Lists.newArrayList(); + NavigableMap pendingResults = Maps.newTreeMap(); + for (Map.Entry> e : commitRequestMap.entrySet()) { + if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue().getCommittable().manifest())) { + pendingResults.put(e.getKey(), EMPTY_WRITE_RESULT); + } else { + DeltaManifests deltaManifests = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, e.getValue().getCommittable().manifest()); + pendingResults.put( + e.getKey(), + FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs())); + manifests.addAll(deltaManifests.manifests()); + } + } + + CommitSummary summary = new CommitSummary(pendingResults); + commitPendingResult(pendingResults, summary, newFlinkJobId, operatorId); + if (committerMetrics != null) { + committerMetrics.updateCommitSummary(summary); + } + + if (!compactMode) { + FlinkManifestUtil.deleteCommittedManifests(table, manifests, newFlinkJobId, checkpointId); + } + } + + private void logCommitSummary(CommitSummary summary, String description) { + LOG.info( + "Preparing for commit: {} on table: {} branch: {} with summary: {}.", + description, + table, + branch, + summary); + } + + private void commitPendingResult( + NavigableMap pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId) { + long totalFiles = summary.dataFilesCount() + summary.deleteFilesCount(); + continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0; + if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { + if (replacePartitions) { + replacePartitions(pendingResults, summary, newFlinkJobId, operatorId); + } else { + commitDeltaTxn(pendingResults, summary, newFlinkJobId, operatorId); + } + continuousEmptyCheckpoints = 0; + } else { + long checkpointId = pendingResults.lastKey(); + LOG.info("Skip commit for checkpoint {} due to no data files or delete files.", checkpointId); + } + } + + private void replacePartitions( + NavigableMap pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId) { + long checkpointId = pendingResults.lastKey(); + Preconditions.checkState( + summary.deleteFilesCount() == 0, "Cannot overwrite partitions with delete files."); + // Commit the overwrite transaction. 
+ ReplacePartitions dynamicOverwrite = table.newReplacePartitions().scanManifestsWith(workerPool); + for (WriteResult result : pendingResults.values()) { + Preconditions.checkState( + result.referencedDataFiles().length == 0, "Should have no referenced data files."); + Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); + } + String description = "dynamic partition overwrite"; + + logCommitSummary(summary, description); + commitOperation(dynamicOverwrite, description, newFlinkJobId, operatorId, checkpointId); + } + + private void commitDeltaTxn( + NavigableMap pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId) { + long checkpointId = pendingResults.lastKey(); + if (summary.deleteFilesCount() == 0) { + // To be compatible with iceberg format V1. + AppendFiles appendFiles = table.newAppend().scanManifestsWith(workerPool); + for (WriteResult result : pendingResults.values()) { + Preconditions.checkState( + result.referencedDataFiles().length == 0, + "Should have no referenced data files for append."); + Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); + } + String description = "append"; + logCommitSummary(summary, description); + // fail all commits as really its only one + commitOperation(appendFiles, description, newFlinkJobId, operatorId, checkpointId); + } else { + // To be compatible with iceberg format V2. + for (Map.Entry e : pendingResults.entrySet()) { + // We don't commit the merged result into a single transaction because for the sequential + // transaction txn1 and txn2, the equality-delete files of txn2 are required to be applied + // to data files from txn1. Committing the merged one will lead to the incorrect delete + // semantic. + WriteResult result = e.getValue(); + + // Row delta validations are not needed for streaming changes that write equality deletes. + // Equality deletes are applied to data in all previous sequence numbers, so retries may + // push deletes further in the future, but do not affect correctness. Position deletes + // committed to the table in this path are used only to delete rows from data files that are + // being added in this commit. There is no way for data files added along with the delete + // files to be concurrently removed, so there is no need to validate the files referenced by + // the position delete files that are being committed. + RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool); + + Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); + Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); + + String description = "rowDelta"; + logCommitSummary(summary, description); + commitOperation(rowDelta, description, newFlinkJobId, operatorId, e.getKey()); + } + } + } + + private void commitOperation( + SnapshotUpdate operation, + String description, + String newFlinkJobId, + String operatorId, + long checkpointId) { + + snapshotProperties.forEach(operation::set); + // custom snapshot metadata properties will be overridden if they conflict with internal ones + // used by the sink. + operation.set(SinkUtil.MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); + operation.set(SinkUtil.FLINK_JOB_ID, newFlinkJobId); + operation.set(SinkUtil.OPERATOR_ID, operatorId); + operation.toBranch(branch); + + long startNano = System.nanoTime(); + operation.commit(); // abort is automatically called if this fails. 
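For context, a simplified sketch of how the properties stamped above can be read back on restart to find the last committed checkpoint for a job/operator pair. This is an assumption about the general mechanism (the patch delegates it to SinkUtil.getMaxCommittedCheckpointId); it is not that method's actual implementation, it assumes the summary keys match the string constants shown later in this patch, and for brevity it only walks the main branch:

// Walk the snapshot ancestry and return the newest flink.max-committed-checkpoint-id
// written by the given jobId/operatorId pair, or -1 if none was found.
static long lastCommittedCheckpointId(Table table, String jobId, String operatorId) {
  Snapshot snapshot = table.currentSnapshot();
  while (snapshot != null) {
    Map<String, String> summary = snapshot.summary();
    if (summary != null
        && jobId.equals(summary.get("flink.job-id"))
        && operatorId.equals(summary.get("flink.operator-id"))
        && summary.get("flink.max-committed-checkpoint-id") != null) {
      return Long.parseLong(summary.get("flink.max-committed-checkpoint-id"));
    }
    Long parentId = snapshot.parentId();
    snapshot = parentId == null ? null : table.snapshot(parentId);
  }
  return -1L;
}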
+ long durationMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano); + LOG.info( + "Committed {} to table: {}, branch: {}, checkpointId {} in {} ms", + description, + table.name(), + branch, + checkpointId, + durationMs); + if (committerMetrics != null) { + committerMetrics.commitDuration(durationMs); + } + } + + @Override + public void close() throws IOException { + tableLoader.close(); + workerPool.shutdown(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java new file mode 100644 index 000000000000..89432cff2b29 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.SortedMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.runtime.typeutils.SortedMapTypeInfo; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.ReplacePartitions; +import org.apache.iceberg.RowDelta; +import org.apache.iceberg.SnapshotUpdate; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.base.Strings; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; 
+import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.PropertyUtil; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class IcebergFilesCommitter extends AbstractStreamOperator + implements OneInputStreamOperator, BoundedOneInput { + + private static final long serialVersionUID = 1L; + private static final long INITIAL_CHECKPOINT_ID = -1L; + private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; + + private static final Logger LOG = LoggerFactory.getLogger(IcebergFilesCommitter.class); + private static final String FLINK_JOB_ID = "flink.job-id"; + private static final String OPERATOR_ID = "flink.operator-id"; + + // The max checkpoint id we've committed to iceberg table. As the flink's checkpoint is always + // increasing, so we could correctly commit all the data files whose checkpoint id is greater than + // the max committed one to iceberg table, for avoiding committing the same data files twice. This + // id will be attached to iceberg's meta when committing the iceberg transaction. + private static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; + static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; + + // TableLoader to load iceberg table lazily. + private final TableLoader tableLoader; + private final boolean replacePartitions; + private final Map snapshotProperties; + + // A sorted map to maintain the completed data files for each pending checkpointId (which have not + // been committed to iceberg table). We need a sorted map here because there's possible that few + // checkpoints snapshot failed, for example: the 1st checkpoint have 2 data files <1, >, the 2st checkpoint have 1 data files <2, >. Snapshot for checkpoint#1 + // interrupted because of network/disk failure etc, while we don't expect any data loss in iceberg + // table. So we keep the finished files <1, > in memory and retry to commit iceberg + // table when the next checkpoint happen. + private final NavigableMap dataFilesPerCheckpoint = Maps.newTreeMap(); + + // The completed files cache for current checkpoint. Once the snapshot barrier received, it will + // be flushed to the 'dataFilesPerCheckpoint'. + private final Map> writeResultsSinceLastSnapshot = Maps.newHashMap(); + private final String branch; + + // It will have an unique identifier for one job. + private transient String flinkJobId; + private transient String operatorUniqueId; + private transient Table table; + private transient IcebergFilesCommitterMetrics committerMetrics; + private transient ManifestOutputFileFactory manifestOutputFileFactory; + private transient long maxCommittedCheckpointId; + private transient int continuousEmptyCheckpoints; + private transient int maxContinuousEmptyCommits; + // There're two cases that we restore from flink checkpoints: the first case is restoring from + // snapshot created by the same flink job; another case is restoring from snapshot created by + // another different job. For the second case, we need to maintain the old flink job's id in flink + // state backend to find the max-committed-checkpoint-id when traversing iceberg table's + // snapshots. + private static final ListStateDescriptor JOB_ID_DESCRIPTOR = + new ListStateDescriptor<>("iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO); + private transient ListState jobIdState; + // All pending checkpoints states for this function. 
+ private static final ListStateDescriptor> STATE_DESCRIPTOR = + buildStateDescriptor(); + private transient ListState> checkpointsState; + + private final Integer workerPoolSize; + private final PartitionSpec spec; + private transient ExecutorService workerPool; + + IcebergFilesCommitter( + StreamOperatorParameters parameters, + TableLoader tableLoader, + boolean replacePartitions, + Map snapshotProperties, + Integer workerPoolSize, + String branch, + PartitionSpec spec) { + super(parameters); + this.tableLoader = tableLoader; + this.replacePartitions = replacePartitions; + this.snapshotProperties = snapshotProperties; + this.workerPoolSize = workerPoolSize; + this.branch = branch; + this.spec = spec; + } + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + super.initializeState(context); + this.flinkJobId = getContainingTask().getEnvironment().getJobID().toString(); + this.operatorUniqueId = getRuntimeContext().getOperatorUniqueID(); + + // Open the table loader and load the table. + this.tableLoader.open(); + this.table = tableLoader.loadTable(); + this.committerMetrics = new IcebergFilesCommitterMetrics(super.metrics, table.name()); + + maxContinuousEmptyCommits = + PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); + Preconditions.checkArgument( + maxContinuousEmptyCommits > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); + + int subTaskId = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); + int attemptId = getRuntimeContext().getTaskInfo().getAttemptNumber(); + this.manifestOutputFileFactory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorUniqueId, subTaskId, attemptId); + this.maxCommittedCheckpointId = INITIAL_CHECKPOINT_ID; + + this.checkpointsState = context.getOperatorStateStore().getListState(STATE_DESCRIPTOR); + this.jobIdState = context.getOperatorStateStore().getListState(JOB_ID_DESCRIPTOR); + if (context.isRestored()) { + Iterable jobIdIterable = jobIdState.get(); + if (jobIdIterable == null || !jobIdIterable.iterator().hasNext()) { + LOG.warn( + "Failed to restore committer state. This can happen when operator uid changed and Flink " + + "allowNonRestoredState is enabled. Best practice is to explicitly set the operator id " + + "via FlinkSink#Builder#uidPrefix() so that the committer operator uid is stable. " + + "Otherwise, Flink auto generate an operator uid based on job topology." + + "With that, operator uid is subjective to change upon topology change."); + return; + } + + String restoredFlinkJobId = jobIdIterable.iterator().next(); + Preconditions.checkState( + !Strings.isNullOrEmpty(restoredFlinkJobId), + "Flink job id parsed from checkpoint snapshot shouldn't be null or empty"); + + // Since flink's checkpoint id will start from the max-committed-checkpoint-id + 1 in the new + // flink job even if it's restored from a snapshot created by another different flink job, so + // it's safe to assign the max committed checkpoint id from restored flink job to the current + // flink job. + this.maxCommittedCheckpointId = + SinkUtil.getMaxCommittedCheckpointId(table, restoredFlinkJobId, operatorUniqueId, branch); + + NavigableMap uncommittedDataFiles = + Maps.newTreeMap(checkpointsState.get().iterator().next()) + .tailMap(maxCommittedCheckpointId, false); + if (!uncommittedDataFiles.isEmpty()) { + // Committed all uncommitted data files from the old flink job to iceberg table. 
+ long maxUncommittedCheckpointId = uncommittedDataFiles.lastKey(); + commitUpToCheckpoint( + uncommittedDataFiles, restoredFlinkJobId, operatorUniqueId, maxUncommittedCheckpointId); + } + } + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + super.snapshotState(context); + long checkpointId = context.getCheckpointId(); + LOG.info( + "Start to flush snapshot state to state backend, table: {}, checkpointId: {}", + table, + checkpointId); + + // Update the checkpoint state. + long startNano = System.nanoTime(); + writeToManifestUptoLatestCheckpoint(checkpointId); + + // Reset the snapshot state to the latest state. + checkpointsState.clear(); + checkpointsState.add(dataFilesPerCheckpoint); + + jobIdState.clear(); + jobIdState.add(flinkJobId); + + committerMetrics.checkpointDuration( + TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); + } + + @Override + public void notifyCheckpointComplete(long checkpointId) throws Exception { + super.notifyCheckpointComplete(checkpointId); + // It's possible that we have the following events: + // 1. snapshotState(ckpId); + // 2. snapshotState(ckpId+1); + // 3. notifyCheckpointComplete(ckpId+1); + // 4. notifyCheckpointComplete(ckpId); + // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all + // the files, + // Besides, we need to maintain the max-committed-checkpoint-id to be increasing. + if (checkpointId > maxCommittedCheckpointId) { + LOG.info("Checkpoint {} completed. Attempting commit.", checkpointId); + commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, checkpointId); + this.maxCommittedCheckpointId = checkpointId; + } else { + LOG.info( + "Skipping committing checkpoint {}. {} is already committed.", + checkpointId, + maxCommittedCheckpointId); + } + + // reload the table in case new configuration is needed + this.table = tableLoader.loadTable(); + } + + private void commitUpToCheckpoint( + NavigableMap deltaManifestsMap, + String newFlinkJobId, + String operatorId, + long checkpointId) + throws IOException { + NavigableMap pendingMap = deltaManifestsMap.headMap(checkpointId, true); + List manifests = Lists.newArrayList(); + NavigableMap pendingResults = Maps.newTreeMap(); + for (Map.Entry e : pendingMap.entrySet()) { + if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue())) { + // Skip the empty flink manifest. + continue; + } + + DeltaManifests deltaManifests = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, e.getValue()); + pendingResults.put( + e.getKey(), + FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs())); + manifests.addAll(deltaManifests.manifests()); + } + + CommitSummary summary = new CommitSummary(pendingResults); + commitPendingResult(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); + committerMetrics.updateCommitSummary(summary); + pendingMap.clear(); + FlinkManifestUtil.deleteCommittedManifests(table, manifests, newFlinkJobId, checkpointId); + } + + private void commitPendingResult( + NavigableMap pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId, + long checkpointId) { + long totalFiles = summary.dataFilesCount() + summary.deleteFilesCount(); + continuousEmptyCheckpoints = totalFiles == 0 ? 
continuousEmptyCheckpoints + 1 : 0; + if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { + if (replacePartitions) { + replacePartitions(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); + } else { + commitDeltaTxn(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); + } + continuousEmptyCheckpoints = 0; + } else { + LOG.info("Skip commit for checkpoint {} due to no data files or delete files.", checkpointId); + } + } + + private void replacePartitions( + NavigableMap pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId, + long checkpointId) { + Preconditions.checkState( + summary.deleteFilesCount() == 0, "Cannot overwrite partitions with delete files."); + // Commit the overwrite transaction. + ReplacePartitions dynamicOverwrite = table.newReplacePartitions().scanManifestsWith(workerPool); + for (WriteResult result : pendingResults.values()) { + Preconditions.checkState( + result.referencedDataFiles().length == 0, "Should have no referenced data files."); + Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); + } + + commitOperation( + dynamicOverwrite, + summary, + "dynamic partition overwrite", + newFlinkJobId, + operatorId, + checkpointId); + } + + private void commitDeltaTxn( + NavigableMap pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId, + long checkpointId) { + if (summary.deleteFilesCount() == 0) { + // To be compatible with iceberg format V1. + AppendFiles appendFiles = table.newAppend().scanManifestsWith(workerPool); + for (WriteResult result : pendingResults.values()) { + Preconditions.checkState( + result.referencedDataFiles().length == 0, + "Should have no referenced data files for append."); + Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); + } + commitOperation(appendFiles, summary, "append", newFlinkJobId, operatorId, checkpointId); + } else { + // To be compatible with iceberg format V2. + for (Map.Entry e : pendingResults.entrySet()) { + // We don't commit the merged result into a single transaction because for the sequential + // transaction txn1 and txn2, the equality-delete files of txn2 are required to be applied + // to data files from txn1. Committing the merged one will lead to the incorrect delete + // semantic. + WriteResult result = e.getValue(); + + // Row delta validations are not needed for streaming changes that write equality deletes. + // Equality deletes are applied to data in all previous sequence numbers, so retries may + // push deletes further in the future, but do not affect correctness. Position deletes + // committed to the table in this path are used only to delete rows from data files that are + // being added in this commit. There is no way for data files added along with the delete + // files to be concurrently removed, so there is no need to validate the files referenced by + // the position delete files that are being committed. 
+ RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool); + + Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); + Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); + commitOperation(rowDelta, summary, "rowDelta", newFlinkJobId, operatorId, e.getKey()); + } + } + } + + private void commitOperation( + SnapshotUpdate operation, + CommitSummary summary, + String description, + String newFlinkJobId, + String operatorId, + long checkpointId) { + LOG.info( + "Committing {} for checkpoint {} to table {} branch {} with summary: {}", + description, + checkpointId, + table.name(), + branch, + summary); + snapshotProperties.forEach(operation::set); + // custom snapshot metadata properties will be overridden if they conflict with internal ones + // used by the sink. + operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); + operation.set(FLINK_JOB_ID, newFlinkJobId); + operation.set(OPERATOR_ID, operatorId); + operation.toBranch(branch); + + long startNano = System.nanoTime(); + operation.commit(); // abort is automatically called if this fails. + long durationMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano); + LOG.info( + "Committed {} to table: {}, branch: {}, checkpointId {} in {} ms", + description, + table.name(), + branch, + checkpointId, + durationMs); + committerMetrics.commitDuration(durationMs); + } + + @Override + public void processElement(StreamRecord element) { + FlinkWriteResult flinkWriteResult = element.getValue(); + List writeResults = + writeResultsSinceLastSnapshot.computeIfAbsent( + flinkWriteResult.checkpointId(), k -> Lists.newArrayList()); + writeResults.add(flinkWriteResult.writeResult()); + } + + @Override + public void endInput() throws IOException { + // Flush the buffered data files into 'dataFilesPerCheckpoint' firstly. + long currentCheckpointId = IcebergStreamWriter.END_INPUT_CHECKPOINT_ID; + writeToManifestUptoLatestCheckpoint(currentCheckpointId); + commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, currentCheckpointId); + } + + private void writeToManifestUptoLatestCheckpoint(long checkpointId) throws IOException { + if (!writeResultsSinceLastSnapshot.containsKey(checkpointId)) { + dataFilesPerCheckpoint.put(checkpointId, EMPTY_MANIFEST_DATA); + } + + for (Map.Entry> writeResultsOfCheckpoint : + writeResultsSinceLastSnapshot.entrySet()) { + dataFilesPerCheckpoint.put( + writeResultsOfCheckpoint.getKey(), + writeToManifest(writeResultsOfCheckpoint.getKey(), writeResultsOfCheckpoint.getValue())); + } + + // Clear the local buffer for current checkpoint. + writeResultsSinceLastSnapshot.clear(); + } + + /** + * Write all the complete data files to a newly created manifest file and return the manifest's + * avro serialized bytes. 
+ */ + private byte[] writeToManifest(long checkpointId, List writeResults) + throws IOException { + WriteResult result = WriteResult.builder().addAll(writeResults).build(); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + result, () -> manifestOutputFileFactory.create(checkpointId), spec); + + return SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, deltaManifests); + } + + @Override + public void open() throws Exception { + super.open(); + + final String operatorID = getRuntimeContext().getOperatorUniqueID(); + this.workerPool = + ThreadPools.newFixedThreadPool("iceberg-worker-pool-" + operatorID, workerPoolSize); + } + + @Override + public void close() throws Exception { + if (tableLoader != null) { + tableLoader.close(); + } + + if (workerPool != null) { + workerPool.shutdown(); + } + } + + @VisibleForTesting + static ListStateDescriptor> buildStateDescriptor() { + Comparator longComparator = Comparators.forType(Types.LongType.get()); + // Construct a SortedMapTypeInfo. + SortedMapTypeInfo sortedMapTypeInfo = + new SortedMapTypeInfo<>( + BasicTypeInfo.LONG_TYPE_INFO, + PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO, + longComparator); + return new ListStateDescriptor<>("iceberg-files-committer-state", sortedMapTypeInfo); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterFactory.java new file mode 100644 index 000000000000..88c304e5a0a6 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterFactory.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.Map; +import org.apache.flink.streaming.api.operators.ChainingStrategy; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.flink.TableLoader; + +class IcebergFilesCommitterFactory + implements OneInputStreamOperatorFactory { + + private final TableLoader tableLoader; + private final boolean overwriteMode; + private final Map snapshotProperties; + private final int workerPoolSize; + private final String branch; + private final PartitionSpec spec; + + IcebergFilesCommitterFactory( + TableLoader tableLoader, + boolean overwriteMode, + Map snapshotProperties, + int workerPoolSize, + String branch, + PartitionSpec spec) { + this.tableLoader = tableLoader; + this.overwriteMode = overwriteMode; + this.snapshotProperties = snapshotProperties; + this.workerPoolSize = workerPoolSize; + this.branch = branch; + this.spec = spec; + } + + @Override + public StreamOperator createStreamOperator(StreamOperatorParameters parameters) { + return new IcebergFilesCommitter( + parameters, tableLoader, overwriteMode, snapshotProperties, workerPoolSize, branch, spec); + } + + @Override + public void setChainingStrategy(ChainingStrategy strategy) {} + + @Override + public ChainingStrategy getChainingStrategy() { + return ChainingStrategy.ALWAYS; + } + + @Override + public Class> getStreamOperatorClass(ClassLoader classLoader) { + return IcebergFilesCommitter.class; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java new file mode 100644 index 000000000000..ce81ef11f13c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
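For context on how a OneInputStreamOperatorFactory like the one above is attached to a stream: Flink's DataStream.transform(...) accepts the factory directly. The wiring below is only an illustration, not code from this patch; the sink builder does the real wiring elsewhere in this module, and the <FlinkWriteResult, Void> generics are an assumption:

package org.apache.iceberg.flink.sink;

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;

final class CommitterWiringSketch {
  private CommitterWiringSketch() {}

  // Hypothetical wiring; a single-parallelism committer matches the commit-once semantics above.
  static SingleOutputStreamOperator<Void> attachCommitter(
      DataStream<FlinkWriteResult> writeResults, IcebergFilesCommitterFactory factory) {
    return writeResults
        .transform("IcebergFilesCommitter", Types.VOID, factory)
        .setParallelism(1)
        .setMaxParallelism(1);
  }
}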
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.flink.util.ElapsedTimeGauge; + +@Internal +public class IcebergFilesCommitterMetrics { + private final AtomicLong lastCheckpointDurationMs = new AtomicLong(); + private final AtomicLong lastCommitDurationMs = new AtomicLong(); + private final ElapsedTimeGauge elapsedSecondsSinceLastSuccessfulCommit; + private final Counter committedDataFilesCount; + private final Counter committedDataFilesRecordCount; + private final Counter committedDataFilesByteCount; + private final Counter committedDeleteFilesCount; + private final Counter committedDeleteFilesRecordCount; + private final Counter committedDeleteFilesByteCount; + + public IcebergFilesCommitterMetrics(MetricGroup metrics, String fullTableName) { + MetricGroup committerMetrics = + metrics.addGroup("IcebergFilesCommitter").addGroup("table", fullTableName); + committerMetrics.gauge("lastCheckpointDurationMs", lastCheckpointDurationMs::get); + committerMetrics.gauge("lastCommitDurationMs", lastCommitDurationMs::get); + this.elapsedSecondsSinceLastSuccessfulCommit = new ElapsedTimeGauge(TimeUnit.SECONDS); + committerMetrics.gauge( + "elapsedSecondsSinceLastSuccessfulCommit", elapsedSecondsSinceLastSuccessfulCommit); + this.committedDataFilesCount = committerMetrics.counter("committedDataFilesCount"); + this.committedDataFilesRecordCount = committerMetrics.counter("committedDataFilesRecordCount"); + this.committedDataFilesByteCount = committerMetrics.counter("committedDataFilesByteCount"); + this.committedDeleteFilesCount = committerMetrics.counter("committedDeleteFilesCount"); + this.committedDeleteFilesRecordCount = + committerMetrics.counter("committedDeleteFilesRecordCount"); + this.committedDeleteFilesByteCount = committerMetrics.counter("committedDeleteFilesByteCount"); + } + + public void checkpointDuration(long checkpointDurationMs) { + lastCheckpointDurationMs.set(checkpointDurationMs); + } + + public void commitDuration(long commitDurationMs) { + lastCommitDurationMs.set(commitDurationMs); + } + + /** This is called upon a successful commit. */ + public void updateCommitSummary(CommitSummary stats) { + elapsedSecondsSinceLastSuccessfulCommit.refreshLastRecordedTime(); + committedDataFilesCount.inc(stats.dataFilesCount()); + committedDataFilesRecordCount.inc(stats.dataFilesRecordCount()); + committedDataFilesByteCount.inc(stats.dataFilesByteCount()); + committedDeleteFilesCount.inc(stats.deleteFilesCount()); + committedDeleteFilesRecordCount.inc(stats.deleteFilesRecordCount()); + committedDeleteFilesByteCount.inc(stats.deleteFilesByteCount()); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java new file mode 100644 index 000000000000..752882a9d6c2 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java @@ -0,0 +1,972 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.UUID; +import java.util.function.Function; +import org.apache.flink.annotation.Experimental; +import org.apache.flink.api.common.SupportsConcurrentExecutionAttempts; +import org.apache.flink.api.common.functions.FlatMapFunction; +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.sink2.Committer; +import org.apache.flink.api.connector.sink2.CommitterInitContext; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.sink2.SinkWriter; +import org.apache.flink.api.connector.sink2.SupportsCommitter; +import org.apache.flink.api.connector.sink2.WriterInitContext; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessageTypeInfo; +import org.apache.flink.streaming.api.connector.sink2.SupportsPostCommitTopology; +import org.apache.flink.streaming.api.connector.sink2.SupportsPreCommitTopology; +import org.apache.flink.streaming.api.connector.sink2.SupportsPreWriteTopology; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Partitioning; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.api.FlinkMaintenanceConfig; +import org.apache.iceberg.flink.maintenance.api.LockConfig; +import 
org.apache.iceberg.flink.maintenance.api.RewriteDataFiles; +import org.apache.iceberg.flink.maintenance.api.RewriteDataFilesConfig; +import org.apache.iceberg.flink.maintenance.api.TableMaintenance; +import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; +import org.apache.iceberg.flink.maintenance.operator.LockFactoryBuilder; +import org.apache.iceberg.flink.maintenance.operator.TableChange; +import org.apache.iceberg.flink.sink.shuffle.DataStatisticsOperatorFactory; +import org.apache.iceberg.flink.sink.shuffle.RangePartitioner; +import org.apache.iceberg.flink.sink.shuffle.StatisticsOrRecord; +import org.apache.iceberg.flink.sink.shuffle.StatisticsOrRecordTypeInformation; +import org.apache.iceberg.flink.sink.shuffle.StatisticsType; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.SerializableSupplier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Flink v2 sink offer different hooks to insert custom topologies into the sink. We will use the + * following: + * + *
+ *
+ * <ul>
+ *   <li>{@link SupportsPreWriteTopology} which redistributes the data to the writers based on the
+ *       {@link DistributionMode}
+ *   <li>{@link org.apache.flink.api.connector.sink2.SinkWriter} which writes data/delete files, and
+ *       generates the {@link org.apache.iceberg.io.WriteResult} objects for the files
+ *   <li>{@link SupportsPreCommitTopology} which we use to place the {@link
+ *       org.apache.iceberg.flink.sink.IcebergWriteAggregator} which merges the individual {@link
+ *       org.apache.flink.api.connector.sink2.SinkWriter}'s {@link
+ *       org.apache.iceberg.io.WriteResult}s to a single {@link
+ *       org.apache.iceberg.flink.sink.IcebergCommittable}
+ *   <li>{@link org.apache.iceberg.flink.sink.IcebergCommitter} which commits the incoming {@link
+ *       org.apache.iceberg.flink.sink.IcebergCommittable}s to the Iceberg table
+ *   <li>{@link SupportsPostCommitTopology} which we could use for incremental compaction later.
+ *       This is not implemented yet.
+ * </ul>
+ *
+ * The job graph looks like below:
+ *
+ * <pre>{@code
+ *                            Flink sink
+ *               +-----------------------------------------------------------------------------------+
+ *               |                                                                                   |
+ * +-------+     | +----------+                               +-------------+      +---------------+ |
+ * | Map 1 | ==> | | writer 1 |                               | committer 1 | ---> | post commit 1 | |
+ * +-------+     | +----------+                               +-------------+      +---------------+ |
+ *               |             \                             /                \                      |
+ *               |              \                           /                  \                     |
+ *               |               \                         /                    \                    |
+ * +-------+     | +----------+   \ +-------------------+ /   +-------------+    \ +---------------+ |
+ * | Map 2 | ==> | | writer 2 | --->| commit aggregator |     | committer 2 |      | post commit 2 | |
+ * +-------+     | +----------+     +-------------------+     +-------------+      +---------------+ |
+ *               |                                             Commit only on                        |
+ *               |                                             committer 1                           |
+ *               +-----------------------------------------------------------------------------------+
+ * }</pre>
+ */ +@Experimental +public class IcebergSink + implements Sink, + SupportsPreWriteTopology, + SupportsCommitter, + SupportsPreCommitTopology, + SupportsPostCommitTopology, + SupportsConcurrentExecutionAttempts { + private static final Logger LOG = LoggerFactory.getLogger(IcebergSink.class); + private final TableLoader tableLoader; + private final Map snapshotProperties; + private final String uidSuffix; + private final String sinkId; + private final Map writeProperties; + private final RowType flinkRowType; + private final SerializableSupplier
tableSupplier; + private final transient FlinkWriteConf flinkWriteConf; + private final Set equalityFieldIds; + private final boolean upsertMode; + private final FileFormat dataFileFormat; + private final long targetDataFileSize; + private final String branch; + private final boolean overwriteMode; + private final int workerPoolSize; + private final boolean compactMode; + private final transient FlinkMaintenanceConfig flinkMaintenanceConfig; + + private final Table table; + private final Set equalityFieldColumns = null; + + private IcebergSink( + TableLoader tableLoader, + Table table, + Map snapshotProperties, + String uidSuffix, + Map writeProperties, + RowType flinkRowType, + SerializableSupplier
tableSupplier, + FlinkWriteConf flinkWriteConf, + Set equalityFieldIds, + String branch, + boolean overwriteMode, + FlinkMaintenanceConfig flinkMaintenanceConfig) { + this.tableLoader = tableLoader; + this.snapshotProperties = snapshotProperties; + this.uidSuffix = uidSuffix; + this.writeProperties = writeProperties; + this.flinkRowType = flinkRowType; + this.tableSupplier = tableSupplier; + this.flinkWriteConf = flinkWriteConf; + this.equalityFieldIds = equalityFieldIds; + this.branch = branch; + this.overwriteMode = overwriteMode; + this.table = table; + this.upsertMode = flinkWriteConf.upsertMode(); + this.dataFileFormat = flinkWriteConf.dataFileFormat(); + this.targetDataFileSize = flinkWriteConf.targetDataFileSize(); + this.workerPoolSize = flinkWriteConf.workerPoolSize(); + // We generate a random UUID every time when a sink is created. + // This is used to separate files generated by different sinks writing the same table. + // Also used to generate the aggregator operator name + this.sinkId = UUID.randomUUID().toString(); + this.compactMode = flinkWriteConf.compactMode(); + this.flinkMaintenanceConfig = flinkMaintenanceConfig; + } + + @Override + public SinkWriter createWriter(WriterInitContext context) { + RowDataTaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + tableSupplier, + flinkRowType, + targetDataFileSize, + dataFileFormat, + writeProperties, + equalityFieldIds, + upsertMode); + IcebergStreamWriterMetrics metrics = + new IcebergStreamWriterMetrics(context.metricGroup(), table.name()); + return new IcebergSinkWriter( + tableSupplier.get().name(), + taskWriterFactory, + metrics, + context.getTaskInfo().getIndexOfThisSubtask(), + context.getTaskInfo().getAttemptNumber()); + } + + @Override + public Committer createCommitter(CommitterInitContext context) { + IcebergFilesCommitterMetrics metrics = + new IcebergFilesCommitterMetrics(context.metricGroup(), table.name()); + return new IcebergCommitter( + tableLoader, + branch, + snapshotProperties, + overwriteMode, + workerPoolSize, + sinkId, + metrics, + compactMode); + } + + @Override + public SimpleVersionedSerializer getCommittableSerializer() { + return new IcebergCommittableSerializer(); + } + + @Override + public void addPostCommitTopology( + DataStream> committables) { + + if (!compactMode) { + return; + } + + String suffix = defaultSuffix(uidSuffix, table.name()); + String postCommitUid = String.format("Sink post-commit : %s", suffix); + + SingleOutputStreamOperator tableChangeStream = + committables + .global() + .process(new CommittableToTableChangeConverter(table.io(), table.name(), table.specs())) + .uid(postCommitUid) + .forceNonParallel(); + try { + RewriteDataFilesConfig rewriteDataFilesConfig = + flinkMaintenanceConfig.createRewriteDataFilesConfig(); + RewriteDataFiles.Builder rewriteBuilder = + RewriteDataFiles.builder().config(rewriteDataFilesConfig); + + LockConfig lockConfig = flinkMaintenanceConfig.createLockConfig(); + TriggerLockFactory triggerLockFactory = LockFactoryBuilder.build(lockConfig, table.name()); + String tableMaintenanceUid = String.format("TableMaintenance : %s", suffix); + TableMaintenance.Builder builder = + TableMaintenance.forChangeStream(tableChangeStream, tableLoader, triggerLockFactory) + .uidSuffix(tableMaintenanceUid) + .add(rewriteBuilder); + + builder + .rateLimit(Duration.ofSeconds(flinkMaintenanceConfig.rateLimit())) + .lockCheckDelay(Duration.ofSeconds(flinkMaintenanceConfig.lockCheckDelay())) + .slotSharingGroup(flinkMaintenanceConfig.slotSharingGroup()) + 
.parallelism(flinkMaintenanceConfig.parallelism()) + .append(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to create tableMaintenance ", e); + } + } + + @Override + public DataStream addPreWriteTopology(DataStream inputDataStream) { + return distributeDataStream(inputDataStream); + } + + @Override + public DataStream> addPreCommitTopology( + DataStream> writeResults) { + TypeInformation> typeInformation = + CommittableMessageTypeInfo.of(this::getCommittableSerializer); + + String suffix = defaultSuffix(uidSuffix, table.name()); + String preCommitAggregatorUid = String.format("Sink pre-commit aggregator: %s", suffix); + + // global forces all output records send to subtask 0 of the downstream committer operator. + // This is to ensure commit only happen in one committer subtask. + // Once upstream Flink provides the capability of setting committer operator + // parallelism to 1, this can be removed. + return writeResults + .global() + .transform(preCommitAggregatorUid, typeInformation, new IcebergWriteAggregator(tableLoader)) + .uid(preCommitAggregatorUid) + .setParallelism(1) + .setMaxParallelism(1) + // global forces all output records send to subtask 0 of the downstream committer operator. + // This is to ensure commit only happen in one committer subtask. + // Once upstream Flink provides the capability of setting committer operator + // parallelism to 1, this can be removed. + .global(); + } + + @Override + public SimpleVersionedSerializer getWriteResultSerializer() { + return new WriteResultSerializer(); + } + + public static class Builder implements IcebergSinkBuilder { + private TableLoader tableLoader; + private String uidSuffix = ""; + private Function> inputCreator = null; + @Deprecated private TableSchema tableSchema; + private ResolvedSchema resolvedSchema; + private SerializableTable table; + private final Map writeOptions = Maps.newHashMap(); + private final Map snapshotSummary = Maps.newHashMap(); + private ReadableConfig readableConfig = new Configuration(); + private List equalityFieldColumns = null; + + private Builder() {} + + private Builder forRowData(DataStream newRowDataInput) { + this.inputCreator = ignored -> newRowDataInput; + return this; + } + + /** + * Clean up after removing {@link IcebergSink#forRow(DataStream, TableSchema)} + * + * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #forRow(DataStream, + * ResolvedSchema)} instead. 
+ */ + @Deprecated + private Builder forRow(DataStream input, TableSchema inputTableSchema) { + RowType rowType = (RowType) inputTableSchema.toRowDataType().getLogicalType(); + DataType[] fieldDataTypes = inputTableSchema.getFieldDataTypes(); + + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(fieldDataTypes); + return forMapperOutputType( + input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) + .tableSchema(inputTableSchema); + } + + private Builder forRow(DataStream input, ResolvedSchema inputResolvedSchema) { + RowType rowType = (RowType) inputResolvedSchema.toSinkRowDataType().getLogicalType(); + DataType[] fieldDataTypes = inputResolvedSchema.getColumnDataTypes().toArray(DataType[]::new); + + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(fieldDataTypes); + return forMapperOutputType( + input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) + .resolvedSchema(inputResolvedSchema); + } + + private Builder forMapperOutputType( + DataStream input, MapFunction mapper, TypeInformation outputType) { + this.inputCreator = + newUidSuffix -> { + // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we + // need to set the parallelism of map operator same as its input to keep map operator + // chaining its input, and avoid rebalanced by default. + SingleOutputStreamOperator inputStream = + input.map(mapper, outputType).setParallelism(input.getParallelism()); + if (newUidSuffix != null) { + String uid = String.format("Sink pre-writer mapper: %s", newUidSuffix); + inputStream.name(uid).uid(uid); + } + return inputStream; + }; + return this; + } + + /** + * This iceberg {@link SerializableTable} instance is used for initializing {@link + * IcebergStreamWriter} which will write all the records into {@link DataFile}s and emit them to + * downstream operator. Providing a table would avoid so many table loading from each separate + * task. + * + * @param newTable the loaded iceberg table instance. + * @return {@link IcebergSink.Builder} to connect the iceberg table. + */ + @Override + public Builder table(Table newTable) { + this.table = (SerializableTable) SerializableTable.copyOf(newTable); + return this; + } + + /** + * The table loader is used for loading tables in {@link + * org.apache.iceberg.flink.sink.IcebergCommitter} lazily, we need this loader because {@link + * Table} is not serializable and could not just use the loaded table from Builder#table in the + * remote task manager. + * + * @param newTableLoader to load iceberg table inside tasks. + * @return {@link Builder} to connect the iceberg table. + */ + @Override + public Builder tableLoader(TableLoader newTableLoader) { + this.tableLoader = newTableLoader; + return this; + } + + TableLoader tableLoader() { + return tableLoader; + } + + /** + * Set the write properties for IcebergSink. View the supported properties in {@link + * FlinkWriteOptions} + */ + public Builder set(String property, String value) { + writeOptions.put(property, value); + return this; + } + + /** + * Set the write properties for IcebergSink. 
View the supported properties in {@link + * FlinkWriteOptions} + */ + @Override + public Builder setAll(Map properties) { + writeOptions.putAll(properties); + return this; + } + + @Override + public Builder tableSchema(TableSchema newTableSchema) { + this.tableSchema = newTableSchema; + return this; + } + + @Override + public Builder resolvedSchema(ResolvedSchema newResolvedSchema) { + this.resolvedSchema = newResolvedSchema; + return this; + } + + @Override + public Builder overwrite(boolean newOverwrite) { + writeOptions.put(FlinkWriteOptions.OVERWRITE_MODE.key(), Boolean.toString(newOverwrite)); + return this; + } + + @Override + public Builder flinkConf(ReadableConfig config) { + this.readableConfig = config; + return this; + } + + /** + * Configure the write {@link DistributionMode} that the IcebergSink will use. Currently, flink + * support {@link DistributionMode#NONE} and {@link DistributionMode#HASH} and {@link + * DistributionMode#RANGE} + * + * @param mode to specify the write distribution mode. + * @return {@link IcebergSink.Builder} to connect the iceberg table. + */ + @Override + public Builder distributionMode(DistributionMode mode) { + if (mode != null) { + writeOptions.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), mode.modeName()); + } + return this; + } + + /** + * Range distribution needs to collect statistics about data distribution to properly shuffle + * the records in relatively balanced way. In general, low cardinality should use {@link + * StatisticsType#Map} and high cardinality should use {@link StatisticsType#Sketch} Refer to + * {@link StatisticsType} Javadoc for more details. + * + *

Default is {@link StatisticsType#Auto} where initially Map statistics is used. But if + * cardinality is higher than the threshold (currently 10K) as defined in {@code + * SketchUtil#OPERATOR_SKETCH_SWITCH_THRESHOLD}, statistics collection automatically switches to + * the sketch reservoir sampling. + * + *

Explicit set the statistics type if the default behavior doesn't work. + * + * @param type to specify the statistics type for range distribution. + * @return {@link IcebergSink.Builder} to connect the iceberg table. + */ + public IcebergSink.Builder rangeDistributionStatisticsType(StatisticsType type) { + if (type != null) { + writeOptions.put(FlinkWriteOptions.RANGE_DISTRIBUTION_STATISTICS_TYPE.key(), type.name()); + } + return this; + } + + /** + * If sort order contains partition columns, each sort key would map to one partition and data + * file. This relative weight can avoid placing too many small files for sort keys with low + * traffic. It is a double value that defines the minimal weight for each sort key. `0.02` means + * each key has a base weight of `2%` of the targeted traffic weight per writer task. + * + *

E.g. the sink Iceberg table is partitioned daily by event time. Assume the data stream + * contains events from now up to 180 days ago. With event time, traffic weight distribution + * across different days typically has a long tail pattern. Current day contains the most + * traffic. The older days (long tail) contain less and less traffic. Assume writer parallelism + * is `10`. The total weight across all 180 days is `10,000`. Target traffic weight per writer + * task would be `1,000`. Assume the weight sum for the oldest 150 days is `1,000`. Normally, + * the range partitioner would put all the oldest 150 days in one writer task. That writer task + * would write to 150 small files (one per day). Keeping 150 open files can potentially consume + * large amount of memory. Flushing and uploading 150 files (however small) at checkpoint time + * can also be potentially slow. If this config is set to `0.02`. It means every sort key has a + * base weight of `2%` of targeted weight of `1,000` for every write task. It would essentially + * avoid placing more than `50` data files (one per day) on one writer task no matter how small + * they are. + * + *

This is only applicable to {@link StatisticsType#Map} for low-cardinality scenario. For + * {@link StatisticsType#Sketch} high-cardinality sort columns, they are usually not used as + * partition columns. Otherwise, too many partitions and small files may be generated during + * write. Sketch range partitioner simply splits high-cardinality keys into ordered ranges. + * + *

Default is {@code 0.0%}. + */ + public Builder rangeDistributionSortKeyBaseWeight(double weight) { + writeOptions.put( + FlinkWriteOptions.RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT.key(), Double.toString(weight)); + return this; + } + + /** + * Configuring the write parallel number for iceberg stream writer. + * + * @param newWriteParallelism the number of parallel iceberg stream writer. + * @return {@link IcebergSink.Builder} to connect the iceberg table. + */ + @Override + public Builder writeParallelism(int newWriteParallelism) { + writeOptions.put( + FlinkWriteOptions.WRITE_PARALLELISM.key(), Integer.toString(newWriteParallelism)); + return this; + } + + /** + * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which + * means it will DELETE the old records and then INSERT the new records. In partitioned table, + * the partition fields should be a subset of equality fields, otherwise the old row that + * located in partition-A could not be deleted by the new row that located in partition-B. + * + * @param enabled indicate whether it should transform all INSERT/UPDATE_AFTER events to UPSERT. + * @return {@link IcebergSink.Builder} to connect the iceberg table. + */ + @Override + public Builder upsert(boolean enabled) { + writeOptions.put(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key(), Boolean.toString(enabled)); + return this; + } + + /** + * Configuring the equality field columns for iceberg table that accept CDC or UPSERT events. + * + * @param columns defines the iceberg table's key. + * @return {@link Builder} to connect the iceberg table. + */ + @Override + public Builder equalityFieldColumns(List columns) { + this.equalityFieldColumns = columns; + return this; + } + + /** + * Set the uid suffix for IcebergSink operators. Note that IcebergSink internally consists of + * multiple operators (like writer, committer, aggregator). Actual operator uid will be appended + * with a suffix like "Sink Committer: $uidSuffix". + * + *
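A short configuration sketch tying the range-distribution options above together (the numbers are illustrative; tune the statistics type and base weight to the table's sort-key cardinality and traffic skew):

import org.apache.iceberg.DistributionMode;
import org.apache.iceberg.flink.sink.IcebergSink;
import org.apache.iceberg.flink.sink.shuffle.StatisticsType;

final class RangeDistributionConfigSketch {
  private RangeDistributionConfigSketch() {}

  // RANGE mode with automatic statistics and a 2% base weight per sort key.
  static IcebergSink.Builder configureRangeDistribution(IcebergSink.Builder builder) {
    return builder
        .distributionMode(DistributionMode.RANGE)
        .rangeDistributionStatisticsType(StatisticsType.Auto)
        .rangeDistributionSortKeyBaseWeight(0.02d);
  }
}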

Flink auto generates operator uid if not set explicitly. It is a recommended + * best-practice to set uid for all operators before deploying to production. Flink has an + * option to {@code pipeline.auto-generate-uid=false} to disable auto-generation and force + * explicit setting of all operator uid. + * + *

Be careful with setting this for an existing job, because now we are changing the operator + * uid from an auto-generated one to this new value. When deploying the change with a + * checkpoint, Flink won't be able to restore the previous IcebergSink operator state (more + * specifically the committer operator state). You need to use {@code --allowNonRestoredState} + * to ignore the previous sink state. During restore IcebergSink state is used to check if last + * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss + * if the Iceberg commit failed in the last completed checkpoint. + * + * @param newSuffix suffix for Flink sink operator uid and name + * @return {@link Builder} to connect the iceberg table. + */ + public Builder uidSuffix(String newSuffix) { + this.uidSuffix = newSuffix; + return this; + } + + public Builder snapshotProperties(Map properties) { + snapshotSummary.putAll(properties); + return this; + } + + public Builder setSnapshotProperty(String property, String value) { + snapshotSummary.put(property, value); + return this; + } + + @Override + public Builder toBranch(String branch) { + writeOptions.put(FlinkWriteOptions.BRANCH.key(), branch); + return this; + } + + IcebergSink build() { + + Preconditions.checkArgument( + inputCreator != null, + "Please use forRowData() or forMapperOutputType() to initialize the input DataStream."); + Preconditions.checkNotNull(tableLoader(), "Table loader shouldn't be null"); + + // Set the table if it is not yet set in the builder, so we can do the equalityId checks + SerializableTable serializableTable = checkAndGetTable(tableLoader(), table); + this.table = serializableTable; + // Init the `flinkWriteConf` here, so we can do the checks + FlinkWriteConf flinkWriteConf = new FlinkWriteConf(table, writeOptions, readableConfig); + + Duration tableRefreshInterval = flinkWriteConf.tableRefreshInterval(); + SerializableSupplier

tableSupplier; + if (tableRefreshInterval != null) { + tableSupplier = new CachingTableSupplier(table, tableLoader(), tableRefreshInterval); + } else { + tableSupplier = () -> serializableTable; + } + + boolean overwriteMode = flinkWriteConf.overwriteMode(); + + // Validate the equality fields and partition fields if we enable the upsert mode. + Set equalityFieldIds = + SinkUtil.checkAndGetEqualityFieldIds(table, equalityFieldColumns); + + if (flinkWriteConf.upsertMode()) { + Preconditions.checkState( + !overwriteMode, + "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); + Preconditions.checkState( + !equalityFieldIds.isEmpty(), + "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); + if (!table.spec().isUnpartitioned()) { + for (PartitionField partitionField : table.spec().fields()) { + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), + "In 'hash' distribution mode with equality fields set, source column '%s' of partition field '%s' " + + "should be included in equality fields: '%s'", + table.schema().findColumnName(partitionField.sourceId()), + partitionField, + equalityFieldColumns); + } + } + } + + FlinkMaintenanceConfig flinkMaintenanceConfig = + new FlinkMaintenanceConfig(table, writeOptions, readableConfig); + return new IcebergSink( + tableLoader, + table, + snapshotSummary, + uidSuffix, + SinkUtil.writeProperties(flinkWriteConf.dataFileFormat(), flinkWriteConf, table), + resolvedSchema != null + ? toFlinkRowType(table.schema(), resolvedSchema) + : toFlinkRowType(table.schema(), tableSchema), + tableSupplier, + flinkWriteConf, + equalityFieldIds, + flinkWriteConf.branch(), + overwriteMode, + flinkMaintenanceConfig); + } + + /** + * Append the iceberg sink operators to write records to iceberg table. + * + * @return {@link DataStreamSink} for sink. + */ + @Override + public DataStreamSink append() { + IcebergSink sink = build(); + String suffix = defaultSuffix(uidSuffix, table.name()); + DataStream rowDataInput = inputCreator.apply(suffix); + // Please note that V2 sink framework will apply the uid here to the framework created + // operators like writer, + // committer. E.g. "Sink writer: + DataStreamSink rowDataDataStreamSink = + rowDataInput.sinkTo(sink).uid(suffix).name(suffix); + + // Note that IcebergSink internally consists o multiple operators (like writer, committer, + // aggregator). + // The following parallelism will be propagated to all of the above operators. + rowDataDataStreamSink.setParallelism(sink.resolveWriterParallelism(rowDataInput)); + return rowDataDataStreamSink; + } + } + + private String operatorName(String suffix) { + return uidSuffix != null ? 
suffix + "-" + uidSuffix : suffix; + } + + private static String defaultSuffix(String uidSuffix, String defaultSuffix) { + if (uidSuffix == null || uidSuffix.isEmpty()) { + return defaultSuffix; + } + return uidSuffix; + } + + private static SerializableTable checkAndGetTable(TableLoader tableLoader, Table table) { + if (table == null) { + if (!tableLoader.isOpen()) { + tableLoader.open(); + } + + try (TableLoader loader = tableLoader) { + return (SerializableTable) SerializableTable.copyOf(loader.loadTable()); + } catch (IOException e) { + throw new UncheckedIOException( + "Failed to load iceberg table from table loader: " + tableLoader, e); + } + } + + return (SerializableTable) SerializableTable.copyOf(table); + } + + /** + * Clean up after removing {@link Builder#tableSchema} + * + * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #toFlinkRowType(Schema, + * ResolvedSchema)} instead. + */ + @Deprecated + private static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { + if (requestedSchema != null) { + // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing + // iceberg schema. + Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); + TypeUtil.validateWriteSchema(schema, writeSchema, true, true); + + // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will + // be promoted to iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT + // (backend by 1 'byte'), we will read 4 bytes rather than 1 byte, it will mess up the byte + // array in BinaryRowData. So here we must use flink schema. + return (RowType) requestedSchema.toRowDataType().getLogicalType(); + } else { + return FlinkSchemaUtil.convert(schema); + } + } + + private static RowType toFlinkRowType(Schema schema, ResolvedSchema requestedSchema) { + if (requestedSchema != null) { + // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing + // iceberg schema. + Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); + TypeUtil.validateWriteSchema(schema, writeSchema, true, true); + + // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will + // be promoted to iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT + // (backend by 1 'byte'), we will read 4 bytes rather than 1 byte, it will mess up the byte + // array in BinaryRowData. So here we must use flink schema. 
+ return (RowType) requestedSchema.toSinkRowDataType().getLogicalType(); + } else { + return FlinkSchemaUtil.convert(schema); + } + } + + private DataStream distributeDataStream(DataStream input) { + DistributionMode mode = flinkWriteConf.distributionMode(); + Schema schema = table.schema(); + PartitionSpec spec = table.spec(); + SortOrder sortOrder = table.sortOrder(); + + LOG.info("Write distribution mode is '{}'", mode.modeName()); + switch (mode) { + case NONE: + return distributeDataStreamByNoneDistributionMode(input, schema); + case HASH: + return distributeDataStreamByHashDistributionMode(input, schema, spec); + case RANGE: + return distributeDataStreamByRangeDistributionMode(input, schema, spec, sortOrder); + default: + throw new RuntimeException("Unrecognized " + WRITE_DISTRIBUTION_MODE + ": " + mode); + } + } + + private DataStream distributeDataStreamByNoneDistributionMode( + DataStream input, Schema iSchema) { + if (equalityFieldIds.isEmpty()) { + return input; + } else { + LOG.info("Distribute rows by equality fields, because there are equality fields set"); + return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + } + } + + private DataStream distributeDataStreamByHashDistributionMode( + DataStream input, Schema iSchema, PartitionSpec partitionSpec) { + if (equalityFieldIds.isEmpty()) { + if (partitionSpec.isUnpartitioned()) { + LOG.warn( + "Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and table is unpartitioned"); + return input; + } else { + return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); + } + } else { + if (partitionSpec.isUnpartitioned()) { + LOG.info( + "Distribute rows by equality fields, because there are equality fields set " + + "and table is unpartitioned"); + return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + } else { + for (PartitionField partitionField : partitionSpec.fields()) { + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), + "In 'hash' distribution mode with equality fields set, source column '%s' of partition field '%s' " + + "should be included in equality fields: '%s'", + table.schema().findColumnName(partitionField.sourceId()), + partitionField, + equalityFieldColumns); + } + return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); + } + } + } + + private int resolveWriterParallelism(DataStream input) { + // if the writeParallelism is not specified, we set the default to the input parallelism to + // encourage chaining. + return Optional.ofNullable(flinkWriteConf.writeParallelism()).orElseGet(input::getParallelism); + } + + private DataStream distributeDataStreamByRangeDistributionMode( + DataStream input, + Schema iSchema, + PartitionSpec partitionSpec, + SortOrder sortOrderParam) { + + int writerParallelism = resolveWriterParallelism(input); + + // needed because of checkStyle not allowing us to change the value of an argument + SortOrder sortOrder = sortOrderParam; + + // Ideally, exception should be thrown in the combination of range distribution and + // equality fields. Primary key case should use hash distribution mode. + // Keep the current behavior of falling back to keyBy for backward compatibility. + if (!equalityFieldIds.isEmpty()) { + LOG.warn( + "Hash distribute rows by equality fields, even though {}=range is set. 
" + + "Range distribution for primary keys are not always safe in " + + "Flink streaming writer.", + WRITE_DISTRIBUTION_MODE); + return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + } + + // range distribute by partition key or sort key if table has an SortOrder + Preconditions.checkState( + sortOrder.isSorted() || partitionSpec.isPartitioned(), + "Invalid write distribution mode: range. Need to define sort order or partition spec."); + if (sortOrder.isUnsorted()) { + sortOrder = Partitioning.sortOrderFor(partitionSpec); + LOG.info("Construct sort order from partition spec"); + } + + LOG.info("Range distribute rows by sort order: {}", sortOrder); + StatisticsOrRecordTypeInformation statisticsOrRecordTypeInformation = + new StatisticsOrRecordTypeInformation(flinkRowType, iSchema, sortOrder); + StatisticsType statisticsType = flinkWriteConf.rangeDistributionStatisticsType(); + SingleOutputStreamOperator shuffleStream = + input + .transform( + operatorName("range-shuffle"), + statisticsOrRecordTypeInformation, + new DataStatisticsOperatorFactory( + iSchema, + sortOrder, + writerParallelism, + statisticsType, + flinkWriteConf.rangeDistributionSortKeyBaseWeight())) + // Set the parallelism same as input operator to encourage chaining + .setParallelism(input.getParallelism()); + + if (uidSuffix != null) { + shuffleStream = shuffleStream.uid("shuffle-" + uidSuffix); + } + + return shuffleStream + .partitionCustom(new RangePartitioner(iSchema, sortOrder), r -> r) + .flatMap( + (FlatMapFunction) + (statisticsOrRecord, out) -> { + if (statisticsOrRecord.hasRecord()) { + out.collect(statisticsOrRecord.record()); + } + }) + // Set slot sharing group and the parallelism same as writerParallelism to + // promote operator chaining with the downstream writer operator + .slotSharingGroup("shuffle-partition-custom-group") + .setParallelism(writerParallelism) + .returns(RowData.class); + } + + /** + * Initialize a {@link Builder} to export the data from generic input data stream into iceberg + * table. We use {@link RowData} inside the sink connector, so users need to provide a mapper + * function and a {@link TypeInformation} to convert those generic records to a RowData + * DataStream. + * + * @param input the generic source input data stream. + * @param mapper function to convert the generic data to {@link RowData} + * @param outputType to define the {@link TypeInformation} for the input data. + * @param the data type of records. + * @return {@link Builder} to connect the iceberg table. + */ + public static Builder builderFor( + DataStream input, MapFunction mapper, TypeInformation outputType) { + return new Builder().forMapperOutputType(input, mapper, outputType); + } + + /** + * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into + * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a + * {@link TableSchema} for builder to convert those {@link Row}s to a {@link RowData} DataStream. + * + * @param input the source input data stream with {@link Row}s. + * @param tableSchema defines the {@link TypeInformation} for input data. + * @return {@link Builder} to connect the iceberg table. + * @deprecated Use {@link #forRow(DataStream, ResolvedSchema)} instead. 
+ */ + @Deprecated + public static Builder forRow(DataStream input, TableSchema tableSchema) { + return new Builder().forRow(input, tableSchema); + } + + /** + * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into + * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a + * {@link ResolvedSchema} for builder to convert those {@link Row}s to a {@link RowData} + * DataStream. + * + * @param input the source input data stream with {@link Row}s. + * @param resolvedSchema defines the {@link TypeInformation} for input data. + * @return {@link Builder} to connect the iceberg table. + */ + public static Builder forRow(DataStream input, ResolvedSchema resolvedSchema) { + return new Builder().forRow(input, resolvedSchema); + } + + /** + * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s + * into iceberg table. + * + * @param input the source input data stream with {@link RowData}s. + * @return {@link Builder} to connect the iceberg table. + */ + public static Builder forRowData(DataStream input) { + return new Builder().forRowData(input); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkBuilder.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkBuilder.java new file mode 100644 index 000000000000..577b2b9a4227 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkBuilder.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import java.util.Map; +import org.apache.flink.annotation.Internal; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; + +/** + * This class is for internal purpose of transition between the previous implementation of Flink's + * sink ({@link FlinkSink}) and the new one implementation based on Flink SinkV2 API ({@link + * IcebergSink}). After we remove the previous implementation, all occurrences of this class would + * be replaced by direct {@link IcebergSink} usage. + */ +@Internal +interface IcebergSinkBuilder> { + + /** + * @deprecated Use {@link #resolvedSchema(ResolvedSchema)} instead. 
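Putting the factory methods and builder together, a minimal end-to-end usage sketch (the table location, uid suffix, and column name are illustrative assumptions; use TableLoader.fromCatalog(...) for catalog-managed tables):

import java.util.List;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.IcebergSink;

final class IcebergSinkUsageSketch {
  private IcebergSinkUsageSketch() {}

  static void appendSink(DataStream<RowData> rowDataStream) {
    // Illustrative Hadoop-table location.
    TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/events");

    IcebergSink.forRowData(rowDataStream)
        .tableLoader(tableLoader)
        .upsert(true)
        .equalityFieldColumns(List.of("event_id")) // assumed identifier column
        .uidSuffix("events-iceberg-sink")
        .append();
  }
}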
+ */ + @Deprecated + T tableSchema(TableSchema newTableSchema); + + T resolvedSchema(ResolvedSchema newResolvedSchema); + + T tableLoader(TableLoader newTableLoader); + + T equalityFieldColumns(List columns); + + T overwrite(boolean newOverwrite); + + T setAll(Map properties); + + T flinkConf(ReadableConfig config); + + T table(Table newTable); + + T writeParallelism(int newWriteParallelism); + + T distributionMode(DistributionMode mode); + + T toBranch(String branch); + + T upsert(boolean enabled); + + DataStreamSink append(); + + /** + * @deprecated Use {@link #forRow(DataStream, ResolvedSchema, boolean)} instead. + */ + @Deprecated + static IcebergSinkBuilder forRow( + DataStream input, TableSchema tableSchema, boolean useV2Sink) { + if (useV2Sink) { + return IcebergSink.forRow(input, tableSchema); + } else { + return FlinkSink.forRow(input, tableSchema); + } + } + + static IcebergSinkBuilder forRow( + DataStream input, ResolvedSchema resolvedSchema, boolean useV2Sink) { + if (useV2Sink) { + return IcebergSink.forRow(input, resolvedSchema); + } else { + return FlinkSink.forRow(input, resolvedSchema); + } + } + + static IcebergSinkBuilder forRowData(DataStream input, boolean useV2Sink) { + if (useV2Sink) { + return IcebergSink.forRowData(input); + } else { + return FlinkSink.forRowData(input); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java new file mode 100644 index 000000000000..7234cf74020e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.Collection; +import java.util.concurrent.TimeUnit; +import org.apache.flink.api.connector.sink2.CommittingSinkWriter; +import org.apache.flink.api.connector.sink2.SinkWriter; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Iceberg writer implementation for the {@link SinkWriter} interface. Used by the {@link + * org.apache.iceberg.flink.sink.IcebergSink} (SinkV2). Writes out the data to the final place, and + * emits a single {@link WriteResult} at every checkpoint for every data/delete file created by this + * writer. 
+ */ +class IcebergSinkWriter implements CommittingSinkWriter { + private static final Logger LOG = LoggerFactory.getLogger(IcebergSinkWriter.class); + + private final String fullTableName; + private final TaskWriterFactory taskWriterFactory; + private final IcebergStreamWriterMetrics metrics; + private TaskWriter writer; + private final int subTaskId; + private final int attemptId; + + IcebergSinkWriter( + String fullTableName, + TaskWriterFactory taskWriterFactory, + IcebergStreamWriterMetrics metrics, + int subTaskId, + int attemptId) { + this.fullTableName = fullTableName; + this.taskWriterFactory = taskWriterFactory; + // Initialize the task writer factory. + taskWriterFactory.initialize(subTaskId, attemptId); + // Initialize the task writer. + this.writer = taskWriterFactory.create(); + this.metrics = metrics; + this.subTaskId = subTaskId; + this.attemptId = attemptId; + LOG.debug( + "Created Stream Writer for table {} subtask {} attemptId {}", + fullTableName, + subTaskId, + attemptId); + } + + @Override + public void write(RowData element, Context context) throws IOException, InterruptedException { + writer.write(element); + } + + @Override + public void flush(boolean endOfInput) { + // flush is used to handle flush/endOfInput, so no action is taken here. + } + + @Override + public void close() throws Exception { + if (writer != null) { + writer.close(); + } + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("tableName", fullTableName) + .add("subTaskId", subTaskId) + .add("attemptId", attemptId) + .toString(); + } + + @Override + public Collection prepareCommit() throws IOException { + long startNano = System.nanoTime(); + WriteResult result = writer.complete(); + this.writer = taskWriterFactory.create(); + metrics.updateFlushResult(result); + metrics.flushDuration(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); + LOG.debug( + "Iceberg writer subtask {} attempt {} flushed {} data files and {} delete files", + subTaskId, + attemptId, + result.dataFiles().length, + result.deleteFiles().length); + return Lists.newArrayList(result); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java new file mode 100644 index 000000000000..adb53af27bd7 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +class IcebergStreamWriter extends AbstractStreamOperator + implements OneInputStreamOperator, BoundedOneInput { + + private static final long serialVersionUID = 1L; + static final long END_INPUT_CHECKPOINT_ID = Long.MAX_VALUE; + + private final String fullTableName; + private final TaskWriterFactory taskWriterFactory; + + private transient TaskWriter writer; + private transient int subTaskId; + private transient int attemptId; + private transient IcebergStreamWriterMetrics writerMetrics; + + IcebergStreamWriter(String fullTableName, TaskWriterFactory taskWriterFactory) { + this.fullTableName = fullTableName; + this.taskWriterFactory = taskWriterFactory; + } + + @Override + public void open() { + this.subTaskId = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); + this.attemptId = getRuntimeContext().getTaskInfo().getAttemptNumber(); + this.writerMetrics = new IcebergStreamWriterMetrics(super.metrics, fullTableName); + + // Initialize the task writer factory. + this.taskWriterFactory.initialize(subTaskId, attemptId); + + // Initialize the task writer. + this.writer = taskWriterFactory.create(); + } + + @Override + public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { + flush(checkpointId); + this.writer = taskWriterFactory.create(); + } + + @Override + public void processElement(StreamRecord element) throws Exception { + writer.write(element.getValue()); + } + + @Override + public void close() throws Exception { + super.close(); + if (writer != null) { + writer.close(); + writer = null; + } + } + + @Override + public void endInput() throws IOException { + // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the + // remaining completed files to downstream before closing the writer so that we won't miss any + // of them. + // Note that if the task is not closed after calling endInput, checkpoint may be triggered again + // causing files to be sent repeatedly, the writer is marked as null after the last file is sent + // to guard against duplicated writes. + flush(END_INPUT_CHECKPOINT_ID); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("tableName", fullTableName) + .add("subTaskId", subTaskId) + .add("attemptId", attemptId) + .toString(); + } + + /** close all open files and emit files to downstream committer operator */ + private void flush(long checkpointId) throws IOException { + if (writer == null) { + return; + } + + long startNano = System.nanoTime(); + WriteResult result = writer.complete(); + writerMetrics.updateFlushResult(result); + output.collect(new StreamRecord<>(new FlinkWriteResult(checkpointId, result))); + writerMetrics.flushDuration(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); + + // Set writer to null to prevent duplicate flushes in the corner case of + // prepareSnapshotPreBarrier happening after endInput. 
+ writer = null; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java new file mode 100644 index 000000000000..434f3969577f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import com.codahale.metrics.SlidingWindowReservoir; +import java.util.Arrays; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; +import org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.Histogram; +import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.util.ScanTaskUtil; + +@Internal +public class IcebergStreamWriterMetrics { + // 1,024 reservoir size should cost about 8KB, which is quite small. + // It should also produce good accuracy for histogram distribution (like percentiles). 
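+  // A SlidingWindowReservoir only retains the most recent samples, so the file size histograms
+  // below reflect recent flushes rather than the whole lifetime of the job.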
+ private static final int HISTOGRAM_RESERVOIR_SIZE = 1024; + + private final Counter flushedDataFiles; + private final Counter flushedDeleteFiles; + private final Counter flushedReferencedDataFiles; + private final AtomicLong lastFlushDurationMs; + private final Histogram dataFilesSizeHistogram; + private final Histogram deleteFilesSizeHistogram; + + public IcebergStreamWriterMetrics(MetricGroup metrics, String fullTableName) { + MetricGroup writerMetrics = + metrics.addGroup("IcebergStreamWriter").addGroup("table", fullTableName); + this.flushedDataFiles = writerMetrics.counter("flushedDataFiles"); + this.flushedDeleteFiles = writerMetrics.counter("flushedDeleteFiles"); + this.flushedReferencedDataFiles = writerMetrics.counter("flushedReferencedDataFiles"); + this.lastFlushDurationMs = new AtomicLong(); + writerMetrics.gauge("lastFlushDurationMs", lastFlushDurationMs::get); + + com.codahale.metrics.Histogram dropwizardDataFilesSizeHistogram = + new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); + this.dataFilesSizeHistogram = + writerMetrics.histogram( + "dataFilesSizeHistogram", + new DropwizardHistogramWrapper(dropwizardDataFilesSizeHistogram)); + com.codahale.metrics.Histogram dropwizardDeleteFilesSizeHistogram = + new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); + this.deleteFilesSizeHistogram = + writerMetrics.histogram( + "deleteFilesSizeHistogram", + new DropwizardHistogramWrapper(dropwizardDeleteFilesSizeHistogram)); + } + + public void updateFlushResult(WriteResult result) { + flushedDataFiles.inc(result.dataFiles().length); + flushedDeleteFiles.inc(result.deleteFiles().length); + flushedReferencedDataFiles.inc(result.referencedDataFiles().length); + + // For file size distribution histogram, we don't have to update them after successful commits. + // This should works equally well and we avoided the overhead of tracking the list of file sizes + // in the {@link CommitSummary}, which currently stores simple stats for counters and gauges + // metrics. + Arrays.stream(result.dataFiles()) + .forEach( + dataFile -> { + dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); + }); + Arrays.stream(result.deleteFiles()) + .forEach( + deleteFile -> { + deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); + }); + } + + public void flushDuration(long flushDurationMs) { + lastFlushDurationMs.set(flushDurationMs); + } + + public Counter getFlushedDataFiles() { + return flushedDataFiles; + } + + public Counter getFlushedDeleteFiles() { + return flushedDeleteFiles; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java new file mode 100644 index 000000000000..794ade577976 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.Collection; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; +import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; +import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Operator which aggregates the individual {@link WriteResult} objects) to a single {@link + * IcebergCommittable} per checkpoint (storing the serialized {@link + * org.apache.iceberg.flink.sink.DeltaManifests}, jobId, operatorId, checkpointId) + */ +class IcebergWriteAggregator extends AbstractStreamOperator> + implements OneInputStreamOperator< + CommittableMessage, CommittableMessage> { + private static final Logger LOG = LoggerFactory.getLogger(IcebergWriteAggregator.class); + private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; + private final Collection results; + private transient ManifestOutputFileFactory icebergManifestOutputFileFactory; + private transient Table table; + private final TableLoader tableLoader; + + IcebergWriteAggregator(TableLoader tableLoader) { + this.results = Sets.newHashSet(); + this.tableLoader = tableLoader; + } + + @Override + public void open() throws Exception { + if (!tableLoader.isOpen()) { + tableLoader.open(); + } + + String flinkJobId = getContainingTask().getEnvironment().getJobID().toString(); + String operatorId = getOperatorID().toString(); + int subTaskId = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); + Preconditions.checkArgument( + subTaskId == 0, "The subTaskId must be zero in the IcebergWriteAggregator"); + int attemptId = getRuntimeContext().getTaskInfo().getAttemptNumber(); + this.table = tableLoader.loadTable(); + + this.icebergManifestOutputFileFactory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, subTaskId, attemptId); + } + + @Override + public void finish() throws IOException { + prepareSnapshotPreBarrier(Long.MAX_VALUE); + } + + @Override + public void prepareSnapshotPreBarrier(long checkpointId) throws IOException { + IcebergCommittable committable = + new IcebergCommittable( + writeToManifest(results, checkpointId), + getContainingTask().getEnvironment().getJobID().toString(), + getRuntimeContext().getOperatorUniqueID(), + checkpointId); + CommittableMessage summary = + new CommittableSummary<>(0, 1, checkpointId, 1, 1, 0); + output.collect(new StreamRecord<>(summary)); + CommittableMessage message = 
+ new CommittableWithLineage<>(committable, checkpointId, 0); + output.collect(new StreamRecord<>(message)); + LOG.info("Emitted commit message to downstream committer operator"); + results.clear(); + } + + /** + * Write all the completed data files to a newly created manifest file and return the manifest's + * avro serialized bytes. + */ + public byte[] writeToManifest(Collection writeResults, long checkpointId) + throws IOException { + if (writeResults.isEmpty()) { + return EMPTY_MANIFEST_DATA; + } + + WriteResult result = WriteResult.builder().addAll(writeResults).build(); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + result, () -> icebergManifestOutputFileFactory.create(checkpointId), table.spec()); + + return SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, deltaManifests); + } + + @Override + public void processElement(StreamRecord> element) + throws Exception { + + if (element.isRecord() && element.getValue() instanceof CommittableWithLineage) { + results.add(((CommittableWithLineage) element.getValue()).getCommittable()); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java new file mode 100644 index 000000000000..6ba87bea30c2 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Supplier; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.HasTableOperations; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableOperations; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Strings; + +@Internal +public class ManifestOutputFileFactory { + // Users could define their own flink manifests directory by setting this value in table + // properties. + @VisibleForTesting static final String FLINK_MANIFEST_LOCATION = "flink.manifests.location"; + private final Supplier
tableSupplier; + private final Map props; + private final String flinkJobId; + private final String operatorUniqueId; + private final int subTaskId; + private final long attemptNumber; + private final AtomicInteger fileCount = new AtomicInteger(0); + + ManifestOutputFileFactory( + Supplier
tableSupplier, + Map props, + String flinkJobId, + String operatorUniqueId, + int subTaskId, + long attemptNumber) { + this.tableSupplier = tableSupplier; + this.props = props; + this.flinkJobId = flinkJobId; + this.operatorUniqueId = operatorUniqueId; + this.subTaskId = subTaskId; + this.attemptNumber = attemptNumber; + } + + private String generatePath(long checkpointId) { + return FileFormat.AVRO.addExtension( + String.format( + Locale.ROOT, + "%s-%s-%05d-%d-%d-%05d", + flinkJobId, + operatorUniqueId, + subTaskId, + attemptNumber, + checkpointId, + fileCount.incrementAndGet())); + } + + public OutputFile create(long checkpointId) { + String flinkManifestDir = props.get(FLINK_MANIFEST_LOCATION); + TableOperations ops = ((HasTableOperations) tableSupplier.get()).operations(); + + String newManifestFullPath; + if (Strings.isNullOrEmpty(flinkManifestDir)) { + // User don't specify any flink manifest directory, so just use the default metadata path. + newManifestFullPath = ops.metadataFileLocation(generatePath(checkpointId)); + } else { + newManifestFullPath = + String.format("%s/%s", stripTrailingSlash(flinkManifestDir), generatePath(checkpointId)); + } + + return tableSupplier.get().io().newOutputFile(newManifestFullPath); + } + + private static String stripTrailingSlash(String path) { + String result = path; + while (result.endsWith("/")) { + result = result.substring(0, result.length() - 1); + } + return result; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java new file mode 100644 index 000000000000..17c8233e1f6f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.RowDataWrapper; + +/** + * Create a {@link KeySelector} to shuffle by partition key, then each partition/bucket will be + * wrote by only one task. That will reduce lots of small files in partitioned fanout write policy + * for {@link FlinkSink}. 
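+ *
+ * <p>A minimal usage sketch (variable names are illustrative only):
+ *
+ * <pre>{@code
+ * RowType flinkRowType = FlinkSchemaUtil.convert(table.schema());
+ * KeyedStream<RowData, String> keyed =
+ *     rowDataStream.keyBy(new PartitionKeySelector(table.spec(), table.schema(), flinkRowType));
+ * }</pre>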
+ */ +@Internal +public class PartitionKeySelector implements KeySelector { + + private final Schema schema; + private final PartitionKey partitionKey; + private final RowType flinkSchema; + + private transient RowDataWrapper rowDataWrapper; + + public PartitionKeySelector(PartitionSpec spec, Schema schema, RowType flinkSchema) { + this.schema = schema; + this.partitionKey = new PartitionKey(spec, schema); + this.flinkSchema = flinkSchema; + } + + /** + * Construct the {@link RowDataWrapper} lazily here because few members in it are not + * serializable. In this way, we don't have to serialize them with forcing. + */ + private RowDataWrapper lazyRowDataWrapper() { + if (rowDataWrapper == null) { + rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + } + return rowDataWrapper; + } + + @Override + public String getKey(RowData row) { + partitionKey.partition(lazyRowDataWrapper().wrap(row)); + return partitionKey.toPath(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java new file mode 100644 index 000000000000..3eb4dba80281 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Map; +import java.util.Set; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.Tasks; + +class PartitionedDeltaWriter extends BaseDeltaTaskWriter { + + private final PartitionKey partitionKey; + + private final Map writers = Maps.newHashMap(); + + PartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + Set equalityFieldIds, + boolean upsert) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + flinkSchema, + equalityFieldIds, + upsert); + this.partitionKey = new PartitionKey(spec, schema); + } + + @Override + RowDataDeltaWriter route(RowData row) { + partitionKey.partition(wrapper().wrap(row)); + + RowDataDeltaWriter writer = writers.get(partitionKey); + if (writer == null) { + // NOTICE: we need to copy a new partition key here, in case of messing up the keys in + // writers. + PartitionKey copiedKey = partitionKey.copy(); + writer = new RowDataDeltaWriter(copiedKey); + writers.put(copiedKey, writer); + } + + return writer; + } + + @Override + public void close() { + try { + Tasks.foreach(writers.values()) + .throwFailureWhenFinished() + .noRetry() + .run(RowDataDeltaWriter::close, IOException.class); + + writers.clear(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to close equality delta writer", e); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java new file mode 100644 index 000000000000..7c11b20c449d --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.Collection; +import java.util.Map; +import java.util.Set; +import java.util.function.Supplier; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.io.PartitionedFanoutWriter; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.UnpartitionedWriter; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.SerializableSupplier; + +public class RowDataTaskWriterFactory implements TaskWriterFactory { + private final Supplier
tableSupplier; + private final Schema schema; + private final RowType flinkSchema; + private final PartitionSpec spec; + private final long targetFileSizeBytes; + private final FileFormat format; + private final Set equalityFieldIds; + private final boolean upsert; + private final FileAppenderFactory appenderFactory; + + private transient OutputFileFactory outputFileFactory; + + public RowDataTaskWriterFactory( + Table table, + RowType flinkSchema, + long targetFileSizeBytes, + FileFormat format, + Map writeProperties, + Collection equalityFieldIds, + boolean upsert) { + this( + () -> table, + flinkSchema, + targetFileSizeBytes, + format, + writeProperties, + equalityFieldIds, + upsert); + } + + public RowDataTaskWriterFactory( + SerializableSupplier
tableSupplier, + RowType flinkSchema, + long targetFileSizeBytes, + FileFormat format, + Map writeProperties, + Collection equalityFieldIds, + boolean upsert) { + this( + tableSupplier, + flinkSchema, + targetFileSizeBytes, + format, + writeProperties, + equalityFieldIds, + upsert, + tableSupplier.get().schema(), + tableSupplier.get().spec()); + } + + public RowDataTaskWriterFactory( + SerializableSupplier
tableSupplier, + RowType flinkSchema, + long targetFileSizeBytes, + FileFormat format, + Map writeProperties, + Collection equalityFieldIds, + boolean upsert, + Schema schema, + PartitionSpec spec) { + this.tableSupplier = tableSupplier; + + Table table; + if (tableSupplier instanceof CachingTableSupplier) { + // rely on the initial table metadata for schema, etc., until schema evolution is supported + table = ((CachingTableSupplier) tableSupplier).initialTable(); + } else { + table = tableSupplier.get(); + } + + this.schema = schema; + this.flinkSchema = flinkSchema; + this.spec = spec; + this.targetFileSizeBytes = targetFileSizeBytes; + this.format = format; + this.equalityFieldIds = equalityFieldIds != null ? Sets.newHashSet(equalityFieldIds) : null; + this.upsert = upsert; + + if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { + this.appenderFactory = + new FlinkAppenderFactory( + table, schema, flinkSchema, writeProperties, spec, null, null, null); + } else if (upsert) { + // In upsert mode, only the new row is emitted using INSERT row kind. Therefore, any column of + // the inserted row + // may differ from the deleted row other than the primary key fields, and the delete file must + // contain values + // that are correct for the deleted row. Therefore, only write the equality delete fields. + this.appenderFactory = + new FlinkAppenderFactory( + table, + schema, + flinkSchema, + writeProperties, + spec, + ArrayUtil.toPrimitive(equalityFieldIds.toArray(new Integer[0])), + TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), + null); + } else { + this.appenderFactory = + new FlinkAppenderFactory( + table, + schema, + flinkSchema, + writeProperties, + spec, + ArrayUtil.toPrimitive(equalityFieldIds.toArray(new Integer[0])), + schema, + null); + } + } + + @Override + public void initialize(int taskId, int attemptId) { + Table table; + if (tableSupplier instanceof CachingTableSupplier) { + // rely on the initial table metadata for schema, etc., until schema evolution is supported + table = ((CachingTableSupplier) tableSupplier).initialTable(); + } else { + table = tableSupplier.get(); + } + + refreshTable(); + + this.outputFileFactory = + OutputFileFactory.builderFor(table, taskId, attemptId) + .format(format) + .ioSupplier(() -> tableSupplier.get().io()) + .defaultSpec(spec) + .build(); + } + + @Override + public TaskWriter create() { + Preconditions.checkNotNull( + outputFileFactory, + "The outputFileFactory shouldn't be null if we have invoked the initialize()."); + + refreshTable(); + + if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { + // Initialize a task writer to write INSERT only. + if (spec.isUnpartitioned()) { + return new UnpartitionedWriter<>( + spec, + format, + appenderFactory, + outputFileFactory, + tableSupplier.get().io(), + targetFileSizeBytes); + } else { + return new RowDataPartitionedFanoutWriter( + spec, + format, + appenderFactory, + outputFileFactory, + tableSupplier.get().io(), + targetFileSizeBytes, + schema, + flinkSchema); + } + } else { + // Initialize a task writer to write both INSERT and equality DELETE. 
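+      // The appender factory created in the constructor already determines what goes into the
+      // delete files (all columns, or only the equality fields in upsert mode); here we only pick
+      // the delta writer matching the partition layout.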
+ if (spec.isUnpartitioned()) { + return new UnpartitionedDeltaWriter( + spec, + format, + appenderFactory, + outputFileFactory, + tableSupplier.get().io(), + targetFileSizeBytes, + schema, + flinkSchema, + equalityFieldIds, + upsert); + } else { + return new PartitionedDeltaWriter( + spec, + format, + appenderFactory, + outputFileFactory, + tableSupplier.get().io(), + targetFileSizeBytes, + schema, + flinkSchema, + equalityFieldIds, + upsert); + } + } + } + + void refreshTable() { + if (tableSupplier instanceof CachingTableSupplier) { + ((CachingTableSupplier) tableSupplier).refreshTable(); + } + } + + private static class RowDataPartitionedFanoutWriter extends PartitionedFanoutWriter { + + private final PartitionKey partitionKey; + private final RowDataWrapper rowDataWrapper; + + RowDataPartitionedFanoutWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema) { + super(spec, format, appenderFactory, fileFactory, io, targetFileSize); + this.partitionKey = new PartitionKey(spec, schema); + this.rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + } + + @Override + protected PartitionKey partition(RowData row) { + partitionKey.partition(rowDataWrapper.wrap(row)); + return partitionKey; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java new file mode 100644 index 000000000000..b3a9ac6ba2eb --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION; +import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.ORC_COMPRESSION; +import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; +import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; +import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Internal +public class SinkUtil { + + private static final long INITIAL_CHECKPOINT_ID = -1L; + + public static final String FLINK_JOB_ID = "flink.job-id"; + + public static final String OPERATOR_ID = "flink.operator-id"; + public static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; + + private SinkUtil() {} + + private static final Logger LOG = LoggerFactory.getLogger(SinkUtil.class); + + static Set checkAndGetEqualityFieldIds(Table table, List equalityFieldColumns) { + Set equalityFieldIds = Sets.newHashSet(table.schema().identifierFieldIds()); + if (equalityFieldColumns != null && !equalityFieldColumns.isEmpty()) { + Set equalityFieldSet = Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); + for (String column : equalityFieldColumns) { + org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column); + Preconditions.checkNotNull( + field, + "Missing required equality field column '%s' in table schema %s", + column, + table.schema()); + equalityFieldSet.add(field.fieldId()); + } + + if (!equalityFieldSet.equals(table.schema().identifierFieldIds())) { + LOG.warn( + "The configured equality field column IDs {} are not matched with the schema identifier field IDs" + + " {}, use job specified equality field columns as the equality fields by default.", + equalityFieldSet, + table.schema().identifierFieldIds()); + } + equalityFieldIds = Sets.newHashSet(equalityFieldSet); + } + return equalityFieldIds; + } + + static long getMaxCommittedCheckpointId( + Table table, String flinkJobId, String operatorId, String branch) { + Snapshot snapshot = table.snapshot(branch); + long lastCommittedCheckpointId = INITIAL_CHECKPOINT_ID; + + while (snapshot != null) { + Map summary = snapshot.summary(); + String snapshotFlinkJobId = summary.get(FLINK_JOB_ID); + String snapshotOperatorId = summary.get(OPERATOR_ID); + if (flinkJobId.equals(snapshotFlinkJobId) + && (snapshotOperatorId == null || snapshotOperatorId.equals(operatorId))) { + String value = summary.get(MAX_COMMITTED_CHECKPOINT_ID); + if (value != null) { + lastCommittedCheckpointId = Long.parseLong(value); + break; + } + } + Long parentSnapshotId = snapshot.parentId(); + snapshot = parentSnapshotId != null ? table.snapshot(parentSnapshotId) : null; + } + + return lastCommittedCheckpointId; + } + + /** + * Based on the {@link FileFormat} overwrites the table level compression properties for the table + * write. 
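+ *
+ * <p>A usage sketch (illustrative; {@code flinkWriteConf} and {@code table} are assumed to be
+ * available from the sink builder):
+ *
+ * <pre>{@code
+ * Map<String, String> writeProps =
+ *     SinkUtil.writeProperties(FileFormat.PARQUET, flinkWriteConf, table);
+ * }</pre>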
+ * + * @param format The FileFormat to use + * @param conf The write configuration + * @param table The table to get the table level settings + * @return The properties to use for writing + */ + public static Map writeProperties( + FileFormat format, FlinkWriteConf conf, @Nullable Table table) { + Map writeProperties = Maps.newHashMap(); + if (table != null) { + writeProperties.putAll(table.properties()); + } + + switch (format) { + case PARQUET: + writeProperties.put(PARQUET_COMPRESSION, conf.parquetCompressionCodec()); + String parquetCompressionLevel = conf.parquetCompressionLevel(); + if (parquetCompressionLevel != null) { + writeProperties.put(PARQUET_COMPRESSION_LEVEL, parquetCompressionLevel); + } + + break; + case AVRO: + writeProperties.put(AVRO_COMPRESSION, conf.avroCompressionCodec()); + String avroCompressionLevel = conf.avroCompressionLevel(); + if (avroCompressionLevel != null) { + writeProperties.put(AVRO_COMPRESSION_LEVEL, conf.avroCompressionLevel()); + } + + break; + case ORC: + writeProperties.put(ORC_COMPRESSION, conf.orcCompressionCodec()); + writeProperties.put(ORC_COMPRESSION_STRATEGY, conf.orcCompressionStrategy()); + break; + default: + throw new IllegalArgumentException(String.format("Unknown file format %s", format)); + } + + return writeProperties; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java new file mode 100644 index 000000000000..e3a1245e8cbd --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.Serializable; +import org.apache.iceberg.io.TaskWriter; + +/** + * Factory to create {@link TaskWriter} + * + * @param data type of record. + */ +public interface TaskWriterFactory extends Serializable { + + /** + * Initialize the factory with a given taskId and attemptId. + * + * @param taskId the identifier of task. + * @param attemptId the attempt id of this task. + */ + void initialize(int taskId, int attemptId); + + /** + * Initialize a {@link TaskWriter} with given task id and attempt id. + * + * @return a newly created task writer. 
+ */ + TaskWriter create(); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java new file mode 100644 index 000000000000..b6ad03514bb0 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.Set; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; + +class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter { + private final RowDataDeltaWriter writer; + + UnpartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + Set equalityFieldIds, + boolean upsert) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + flinkSchema, + equalityFieldIds, + upsert); + this.writer = new RowDataDeltaWriter(null); + } + + @Override + RowDataDeltaWriter route(RowData row) { + return writer; + } + + @Override + public void close() throws IOException { + writer.close(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java new file mode 100644 index 000000000000..40a3ce0cb846 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import org.apache.flink.annotation.Internal; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputViewStreamWrapper; +import org.apache.flink.util.InstantiationUtil; +import org.apache.iceberg.io.WriteResult; + +@Internal +public class WriteResultSerializer implements SimpleVersionedSerializer { + private static final int VERSION = 1; + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(WriteResult writeResult) throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + DataOutputViewStreamWrapper view = new DataOutputViewStreamWrapper(out); + byte[] result = InstantiationUtil.serializeObject(writeResult); + view.write(result); + return out.toByteArray(); + } + + @Override + public WriteResult deserialize(int version, byte[] serialized) throws IOException { + if (version == 1) { + DataInputDeserializer view = new DataInputDeserializer(serialized); + byte[] resultBuf = new byte[serialized.length]; + view.read(resultBuf); + try { + return InstantiationUtil.deserializeObject( + resultBuf, IcebergCommittableSerializer.class.getClassLoader()); + } catch (ClassNotFoundException cnc) { + throw new IOException("Could not deserialize the WriteResult object", cnc); + } + } + throw new IOException("Unrecognized version or corrupt state: " + version); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java new file mode 100644 index 000000000000..41ffa609540b --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java @@ -0,0 +1,275 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.List; +import java.util.Map; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.schema.SchemaWithPartnerVisitor; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +/** + * Visitor class which compares an input schema to a table schema and emits a compatibility {@link + * Result}. + * + *
+ * <ul>
+ *   <li>SAME: The two schemas are semantically identical
+ *   <li>DATA_CONVERSION_NEEDED: We can evolve the data associated with the input schema to match
+ *       the table schema.
+ *   <li>SCHEMA_UPDATE_NEEDED: We need to migrate the table schema to match the input schema.
+ * </ul>
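+ *
+ * <p>A usage sketch (illustrative):
+ *
+ * <pre>{@code
+ * CompareSchemasVisitor.Result result = CompareSchemasVisitor.visit(inputSchema, table.schema());
+ * if (result == CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED) {
+ *   // the table schema must be evolved before the input records can be written
+ * }
+ * }</pre>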
+ * + * The input schema fields are compared to the table schema via their names. + */ +public class CompareSchemasVisitor + extends SchemaWithPartnerVisitor { + + private final Schema tableSchema; + + private CompareSchemasVisitor(Schema tableSchema) { + this.tableSchema = tableSchema; + } + + public static Result visit(Schema dataSchema, Schema tableSchema) { + return visit(dataSchema, tableSchema, true); + } + + public static Result visit(Schema dataSchema, Schema tableSchema, boolean caseSensitive) { + return visit( + dataSchema, + -1, + new CompareSchemasVisitor(tableSchema), + new PartnerIdByNameAccessors(tableSchema, caseSensitive)); + } + + @Override + public Result schema(Schema dataSchema, Integer tableSchemaId, Result downstream) { + if (tableSchemaId == null) { + return Result.SCHEMA_UPDATE_NEEDED; + } + + return downstream; + } + + @Override + public Result struct(Types.StructType struct, Integer tableSchemaId, List fields) { + if (tableSchemaId == null) { + return Result.SCHEMA_UPDATE_NEEDED; + } + + Result result = fields.stream().reduce(Result::merge).orElse(Result.SCHEMA_UPDATE_NEEDED); + + if (result == Result.SCHEMA_UPDATE_NEEDED) { + return Result.SCHEMA_UPDATE_NEEDED; + } + + Type tableSchemaType = + tableSchemaId == -1 ? tableSchema.asStruct() : tableSchema.findField(tableSchemaId).type(); + if (!tableSchemaType.isStructType()) { + return Result.SCHEMA_UPDATE_NEEDED; + } + + for (Types.NestedField tableField : tableSchemaType.asStructType().fields()) { + if (tableField.isRequired() && struct.field(tableField.name()) == null) { + // If a field from the table schema does not exist in the input schema, then we won't visit + // it and check for required/optional compatibility. The only choice is to make the table + // field optional. 
+ return Result.SCHEMA_UPDATE_NEEDED; + } + } + + if (struct.fields().size() != tableSchemaType.asStructType().fields().size()) { + return Result.DATA_CONVERSION_NEEDED; + } + + for (int i = 0; i < struct.fields().size(); ++i) { + if (!struct + .fields() + .get(i) + .name() + .equals(tableSchemaType.asStructType().fields().get(i).name())) { + return Result.DATA_CONVERSION_NEEDED; + } + } + + return result; + } + + @Override + public Result field(Types.NestedField field, Integer tableSchemaId, Result typeResult) { + if (tableSchemaId == null) { + return Result.SCHEMA_UPDATE_NEEDED; + } + + if (typeResult != Result.SAME) { + return typeResult; + } + + if (tableSchema.findField(tableSchemaId).isRequired() && field.isOptional()) { + return Result.SCHEMA_UPDATE_NEEDED; + } else { + return Result.SAME; + } + } + + @Override + public Result list(Types.ListType list, Integer tableSchemaId, Result elementsResult) { + if (tableSchemaId == null) { + return Result.SCHEMA_UPDATE_NEEDED; + } + + return elementsResult; + } + + @Override + public Result map( + Types.MapType map, Integer tableSchemaId, Result keyResult, Result valueResult) { + if (tableSchemaId == null) { + return Result.SCHEMA_UPDATE_NEEDED; + } + + return keyResult.merge(valueResult); + } + + @Override + @SuppressWarnings("checkstyle:CyclomaticComplexity") + public Result primitive(Type.PrimitiveType primitive, Integer tableSchemaId) { + if (tableSchemaId == null) { + return Result.SCHEMA_UPDATE_NEEDED; + } + + Type tableSchemaType = tableSchema.findField(tableSchemaId).type(); + if (!tableSchemaType.isPrimitiveType()) { + return Result.SCHEMA_UPDATE_NEEDED; + } + + Type.PrimitiveType tableSchemaPrimitiveType = tableSchemaType.asPrimitiveType(); + if (primitive.equals(tableSchemaPrimitiveType)) { + return Result.SAME; + } else if (primitive.equals(Types.IntegerType.get()) + && tableSchemaPrimitiveType.equals(Types.LongType.get())) { + return Result.DATA_CONVERSION_NEEDED; + } else if (primitive.equals(Types.FloatType.get()) + && tableSchemaPrimitiveType.equals(Types.DoubleType.get())) { + return Result.DATA_CONVERSION_NEEDED; + } else if (primitive.equals(Types.DateType.get()) + && tableSchemaPrimitiveType.equals(Types.TimestampType.withoutZone())) { + return Result.DATA_CONVERSION_NEEDED; + } else if (primitive.typeId() == Type.TypeID.DECIMAL + && tableSchemaPrimitiveType.typeId() == Type.TypeID.DECIMAL) { + Types.DecimalType dataType = (Types.DecimalType) primitive; + Types.DecimalType tableType = (Types.DecimalType) tableSchemaPrimitiveType; + return dataType.scale() == tableType.scale() && dataType.precision() < tableType.precision() + ? Result.DATA_CONVERSION_NEEDED + : Result.SCHEMA_UPDATE_NEEDED; + } else { + return Result.SCHEMA_UPDATE_NEEDED; + } + } + + static class PartnerIdByNameAccessors implements PartnerAccessors { + private final Schema tableSchema; + private boolean caseSensitive = true; + + PartnerIdByNameAccessors(Schema tableSchema) { + this.tableSchema = tableSchema; + } + + private PartnerIdByNameAccessors(Schema tableSchema, boolean caseSensitive) { + this(tableSchema); + this.caseSensitive = caseSensitive; + } + + @Override + public Integer fieldPartner(Integer tableSchemaFieldId, int fieldId, String name) { + Types.StructType struct; + if (tableSchemaFieldId == -1) { + struct = tableSchema.asStruct(); + } else { + struct = tableSchema.findField(tableSchemaFieldId).type().asStructType(); + } + + Types.NestedField field = + caseSensitive ? 
struct.field(name) : struct.caseInsensitiveField(name); + if (field != null) { + return field.fieldId(); + } + + return null; + } + + @Override + public Integer mapKeyPartner(Integer tableSchemaMapId) { + Types.NestedField mapField = tableSchema.findField(tableSchemaMapId); + if (mapField != null) { + return mapField.type().asMapType().fields().get(0).fieldId(); + } + + return null; + } + + @Override + public Integer mapValuePartner(Integer tableSchemaMapId) { + Types.NestedField mapField = tableSchema.findField(tableSchemaMapId); + if (mapField != null) { + return mapField.type().asMapType().fields().get(1).fieldId(); + } + + return null; + } + + @Override + public Integer listElementPartner(Integer tableSchemaListId) { + Types.NestedField listField = tableSchema.findField(tableSchemaListId); + if (listField != null) { + return listField.type().asListType().fields().get(0).fieldId(); + } + + return null; + } + } + + public enum Result { + SAME(0), + DATA_CONVERSION_NEEDED(1), + SCHEMA_UPDATE_NEEDED(2); + + private static final Map BY_ID = Maps.newHashMap(); + + static { + for (Result e : Result.values()) { + if (BY_ID.put(e.id, e) != null) { + throw new IllegalArgumentException("Duplicate id: " + e.id); + } + } + } + + private final int id; + + Result(int id) { + this.id = id; + } + + private Result merge(Result other) { + return BY_ID.get(Math.max(this.id, other.id)); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java new file mode 100644 index 000000000000..34da5efd940f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.util.Map; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** + * {@link org.apache.iceberg.flink.sink.dynamic.DataConverter} is responsible to change the input + * data to make it compatible with the target schema. This is done when + * + *
+ * <ul>
+ *   <li>The input schema has fewer fields than the target schema.
+ *   <li>The table types are wider than the input types.
+ *   <li>The field order differs between the source and the target schema.
+ * </ul>
+ *
+ * <p>The resolution is as follows:
+ *
+ * <ul>
+ *   <li>In the first case, we add a null value for the missing field (if the field is optional).
+ *   <li>In the second case, we convert the data for the input field to the wider type, e.g. int
+ *       (input type) => long (table type).
+ *   <li>In the third case, we rearrange the input data to match the target table.
+ * </ul>
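+ *
+ * <p>A usage sketch (illustrative):
+ *
+ * <pre>{@code
+ * DataConverter converter = DataConverter.getNullable(sourceRowType, targetRowType);
+ * RowData converted = (RowData) converter.convert(sourceRow);
+ * }</pre>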
+ */ +interface DataConverter { + Object convert(Object object); + + static DataConverter identity() { + return object -> object; + } + + static DataConverter getNullable(LogicalType sourceType, LogicalType targetType) { + return nullable(get(sourceType, targetType)); + } + + static DataConverter get(LogicalType sourceType, LogicalType targetType) { + switch (targetType.getTypeRoot()) { + case BOOLEAN: + case INTEGER: + case FLOAT: + case VARCHAR: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + case BINARY: + case VARBINARY: + return object -> object; + case DOUBLE: + return object -> { + if (object instanceof Float) { + return ((Float) object).doubleValue(); + } else { + return object; + } + }; + case BIGINT: + return object -> { + if (object instanceof Integer) { + return ((Integer) object).longValue(); + } else { + return object; + } + }; + case DECIMAL: + return object -> { + DecimalType toDecimalType = (DecimalType) targetType; + DecimalData decimalData = (DecimalData) object; + if (((DecimalType) sourceType).getPrecision() == toDecimalType.getPrecision()) { + return object; + } else { + return DecimalData.fromBigDecimal( + decimalData.toBigDecimal(), toDecimalType.getPrecision(), toDecimalType.getScale()); + } + }; + case TIMESTAMP_WITHOUT_TIME_ZONE: + return object -> { + if (object instanceof Integer) { + LocalDateTime dateTime = + LocalDateTime.of(LocalDate.ofEpochDay((Integer) object), LocalTime.MIN); + return TimestampData.fromLocalDateTime(dateTime); + } else { + return object; + } + }; + case ROW: + return new RowDataConverter((RowType) sourceType, (RowType) targetType); + case ARRAY: + return new ArrayConverter((ArrayType) sourceType, (ArrayType) targetType); + case MAP: + return new MapConverter((MapType) sourceType, (MapType) targetType); + default: + throw new UnsupportedOperationException("Not a supported type: " + targetType); + } + } + + static DataConverter nullable(DataConverter converter) { + return value -> value == null ? 
null : converter.convert(value); + } + + class RowDataConverter implements DataConverter { + private final RowData.FieldGetter[] fieldGetters; + private final DataConverter[] dataConverters; + + RowDataConverter(RowType sourceType, RowType targetType) { + this.fieldGetters = new RowData.FieldGetter[targetType.getFields().size()]; + this.dataConverters = new DataConverter[targetType.getFields().size()]; + + for (int i = 0; i < targetType.getFields().size(); i++) { + RowData.FieldGetter fieldGetter; + DataConverter dataConverter; + RowType.RowField targetField = targetType.getFields().get(i); + int sourceFieldIndex = sourceType.getFieldIndex(targetField.getName()); + if (sourceFieldIndex == -1) { + if (targetField.getType().isNullable()) { + fieldGetter = row -> null; + dataConverter = value -> null; + } else { + throw new IllegalArgumentException( + String.format( + "Field %s in target schema %s is non-nullable but does not exist in source schema.", + i + 1, targetType)); + } + } else { + RowType.RowField sourceField = sourceType.getFields().get(sourceFieldIndex); + fieldGetter = RowData.createFieldGetter(sourceField.getType(), sourceFieldIndex); + dataConverter = DataConverter.getNullable(sourceField.getType(), targetField.getType()); + } + + this.fieldGetters[i] = fieldGetter; + this.dataConverters[i] = dataConverter; + } + } + + @Override + public RowData convert(Object object) { + RowData sourceData = (RowData) object; + GenericRowData targetData = new GenericRowData(fieldGetters.length); + for (int i = 0; i < fieldGetters.length; i++) { + Object value = fieldGetters[i].getFieldOrNull(sourceData); + targetData.setField(i, dataConverters[i].convert(value)); + } + + return targetData; + } + } + + class ArrayConverter implements DataConverter { + private final ArrayData.ElementGetter elementGetter; + private final DataConverter elementConverter; + + ArrayConverter(ArrayType sourceType, ArrayType targetType) { + this.elementGetter = ArrayData.createElementGetter(sourceType.getElementType()); + this.elementConverter = + DataConverter.getNullable(sourceType.getElementType(), targetType.getElementType()); + } + + @Override + public ArrayData convert(Object object) { + ArrayData arrayData = (ArrayData) object; + Object[] convertedArray = new Object[arrayData.size()]; + for (int i = 0; i < convertedArray.length; i++) { + Object element = elementGetter.getElementOrNull(arrayData, i); + convertedArray[i] = elementConverter.convert(element); + } + + return new GenericArrayData(convertedArray); + } + } + + class MapConverter implements DataConverter { + private final ArrayData.ElementGetter keyGetter; + private final ArrayData.ElementGetter valueGetter; + private final DataConverter keyConverter; + private final DataConverter valueConverter; + + MapConverter(MapType sourceType, MapType targetType) { + this.keyGetter = ArrayData.createElementGetter(sourceType.getKeyType()); + this.valueGetter = ArrayData.createElementGetter(sourceType.getValueType()); + this.keyConverter = + DataConverter.getNullable(sourceType.getKeyType(), targetType.getKeyType()); + this.valueConverter = + DataConverter.getNullable(sourceType.getValueType(), targetType.getValueType()); + } + + @Override + public MapData convert(Object object) { + MapData sourceData = (MapData) object; + ArrayData keyArray = sourceData.keyArray(); + ArrayData valueArray = sourceData.valueArray(); + Map convertedMap = Maps.newLinkedHashMap(); + for (int i = 0; i < keyArray.size(); ++i) { + convertedMap.put( + 
keyConverter.convert(keyGetter.getElementOrNull(keyArray, i)), + valueConverter.convert(valueGetter.getElementOrNull(valueArray, i))); + } + + return new GenericMapData(convertedMap); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java new file mode 100644 index 000000000000..33edefe71eb0 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Objects; +import org.apache.iceberg.flink.sink.DeltaManifests; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +/** + * The aggregated results of a single checkpoint which should be committed. Containing the + * serialized {@link DeltaManifests} file - which contains the commit data, and the jobId, + * operatorId, checkpointId triplet to identify the specific commit. + * + *
+ * <p>
{@link DynamicCommittableSerializer} is used to serialize {@link DynamicCommittable} between + * the {@link DynamicWriter} and the {@link DynamicWriteResultAggregator}. + */ +class DynamicCommittable implements Serializable { + + private final WriteTarget key; + private final byte[] manifest; + private final String jobId; + private final String operatorId; + private final long checkpointId; + + DynamicCommittable( + WriteTarget key, byte[] manifest, String jobId, String operatorId, long checkpointId) { + this.key = key; + this.manifest = manifest; + this.jobId = jobId; + this.operatorId = operatorId; + this.checkpointId = checkpointId; + } + + WriteTarget key() { + return key; + } + + byte[] manifest() { + return manifest; + } + + String jobId() { + return jobId; + } + + String operatorId() { + return operatorId; + } + + long checkpointId() { + return checkpointId; + } + + @Override + public boolean equals(Object o) { + if (o == null || getClass() != o.getClass()) { + return false; + } + + DynamicCommittable that = (DynamicCommittable) o; + return checkpointId == that.checkpointId + && Objects.equals(key, that.key) + && Objects.deepEquals(manifest, that.manifest) + && Objects.equals(jobId, that.jobId) + && Objects.equals(operatorId, that.operatorId); + } + + @Override + public int hashCode() { + return Objects.hash(key, Arrays.hashCode(manifest), jobId, operatorId, checkpointId); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("key", key) + .add("jobId", jobId) + .add("checkpointId", checkpointId) + .add("operatorId", operatorId) + .toString(); + } + + public WriteTarget writeTarget() { + return key; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java new file mode 100644 index 000000000000..4aadcf1f3620 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputViewStreamWrapper; + +/** + * This serializer is used for serializing the {@link DynamicCommittable} objects between the {@link + * DynamicWriter} and the {@link DynamicWriteResultAggregator} operator and for sending it down to + * the {@link DynamicCommitter}. 
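+ *
+ * <p>A sketch of the version-1 wire format, in the order written by {@code serialize} (shown here
+ * only as a reading aid):
+ *
+ * <pre>{@code
+ * WriteTarget key          // written via WriteTarget#serializeTo
+ * UTF         jobId
+ * UTF         operatorId
+ * long        checkpointId
+ * int         manifest length, followed by the raw manifest bytes
+ * }</pre>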
+ */ +class DynamicCommittableSerializer implements SimpleVersionedSerializer { + + private static final int VERSION = 1; + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(DynamicCommittable committable) throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + DataOutputViewStreamWrapper view = new DataOutputViewStreamWrapper(out); + committable.key().serializeTo(view); + view.writeUTF(committable.jobId()); + view.writeUTF(committable.operatorId()); + view.writeLong(committable.checkpointId()); + view.writeInt(committable.manifest().length); + view.write(committable.manifest()); + return out.toByteArray(); + } + + @Override + public DynamicCommittable deserialize(int version, byte[] serialized) throws IOException { + if (version == 1) { + DataInputDeserializer view = new DataInputDeserializer(serialized); + WriteTarget key = WriteTarget.deserializeFrom(view); + String jobId = view.readUTF(); + String operatorId = view.readUTF(); + long checkpointId = view.readLong(); + int manifestLen = view.readInt(); + byte[] manifestBuf; + manifestBuf = new byte[manifestLen]; + view.read(manifestBuf); + return new DynamicCommittable(key, manifestBuf, jobId, operatorId, checkpointId); + } + + throw new IOException("Unrecognized version or corrupt state: " + version); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java new file mode 100644 index 000000000000..e58066aac6ca --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Objects; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.sink2.Committer; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ReplacePartitions; +import org.apache.iceberg.RowDelta; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.SnapshotUpdate; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.sink.CommitSummary; +import org.apache.iceberg.flink.sink.DeltaManifests; +import org.apache.iceberg.flink.sink.DeltaManifestsSerializer; +import org.apache.iceberg.flink.sink.FlinkManifestUtil; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.PropertyUtil; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class implements the Flink SinkV2 {@link Committer} interface to implement the Iceberg + * commits. The implementation builds on the following assumptions: + * + *
+ * <ul>
+ *   <li>There is a single {@link DynamicCommittable} for every table / branch / checkpoint
+ *   <li>There is no late checkpoint - if checkpoint 'x' has been received in one call, then after
+ *       a successful run only checkpoints > x will arrive
+ *   <li>There is no other writer which would generate another commit to the same branch with the
+ *       same jobId-operatorId-checkpointId triplet
+ * </ul>
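+ *
+ * <p>Commit de-duplication relies on snapshot summary properties that are written with every
+ * commit; the values below are purely illustrative:
+ *
+ * <pre>{@code
+ * flink.job-id                      = 9f3c7a2e...
+ * flink.operator-id                 = b1c2d3e4...
+ * flink.max-committed-checkpoint-id = 42
+ * }</pre>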
+ */ +@Internal +class DynamicCommitter implements Committer { + + private static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; + private static final Logger LOG = LoggerFactory.getLogger(DynamicCommitter.class); + private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; + private static final WriteResult EMPTY_WRITE_RESULT = + WriteResult.builder() + .addDataFiles(Lists.newArrayList()) + .addDeleteFiles(Lists.newArrayList()) + .build(); + + private static final long INITIAL_CHECKPOINT_ID = -1L; + + @VisibleForTesting + static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; + + private static final String FLINK_JOB_ID = "flink.job-id"; + private static final String OPERATOR_ID = "flink.operator-id"; + private final Map snapshotProperties; + private final boolean replacePartitions; + private final DynamicCommitterMetrics committerMetrics; + private final Catalog catalog; + private final Map maxContinuousEmptyCommitsMap; + private final Map continuousEmptyCheckpointsMap; + private final ExecutorService workerPool; + + DynamicCommitter( + Catalog catalog, + Map snapshotProperties, + boolean replacePartitions, + int workerPoolSize, + String sinkId, + DynamicCommitterMetrics committerMetrics) { + this.snapshotProperties = snapshotProperties; + this.replacePartitions = replacePartitions; + this.committerMetrics = committerMetrics; + this.catalog = catalog; + this.maxContinuousEmptyCommitsMap = Maps.newHashMap(); + this.continuousEmptyCheckpointsMap = Maps.newHashMap(); + + this.workerPool = + ThreadPools.newFixedThreadPool("iceberg-committer-pool-" + sinkId, workerPoolSize); + } + + @Override + public void commit(Collection> commitRequests) + throws IOException, InterruptedException { + if (commitRequests.isEmpty()) { + return; + } + + // For every table and every checkpoint, we store the list of to-be-committed + // DynamicCommittable. + // There may be DynamicCommittable from previous checkpoints which have not been committed yet. 
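+ // Requests are therefore grouped by (table, branch) and ordered by checkpointId, so that every
+ // table is committed strictly in checkpoint order.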
+ Map>>> commitRequestMap = + Maps.newHashMap(); + for (CommitRequest request : commitRequests) { + NavigableMap>> committables = + commitRequestMap.computeIfAbsent( + new TableKey(request.getCommittable()), unused -> Maps.newTreeMap()); + committables + .computeIfAbsent(request.getCommittable().checkpointId(), unused -> Lists.newArrayList()) + .add(request); + } + + for (Map.Entry>>> entry : + commitRequestMap.entrySet()) { + Table table = catalog.loadTable(TableIdentifier.parse(entry.getKey().tableName())); + DynamicCommittable last = entry.getValue().lastEntry().getValue().get(0).getCommittable(); + long maxCommittedCheckpointId = + getMaxCommittedCheckpointId( + table, last.jobId(), last.operatorId(), entry.getKey().branch()); + // Mark the already committed FilesCommittable(s) as finished + entry + .getValue() + .headMap(maxCommittedCheckpointId, true) + .values() + .forEach(list -> list.forEach(CommitRequest::signalAlreadyCommitted)); + NavigableMap>> uncommitted = + entry.getValue().tailMap(maxCommittedCheckpointId, false); + if (!uncommitted.isEmpty()) { + commitPendingRequests( + table, entry.getKey().branch(), uncommitted, last.jobId(), last.operatorId()); + } + } + } + + private static long getMaxCommittedCheckpointId( + Table table, String flinkJobId, String operatorId, String branch) { + Snapshot snapshot = table.snapshot(branch); + long lastCommittedCheckpointId = INITIAL_CHECKPOINT_ID; + + while (snapshot != null) { + Map summary = snapshot.summary(); + String snapshotFlinkJobId = summary.get(FLINK_JOB_ID); + String snapshotOperatorId = summary.get(OPERATOR_ID); + if (flinkJobId.equals(snapshotFlinkJobId) + && (snapshotOperatorId == null || snapshotOperatorId.equals(operatorId))) { + String value = summary.get(MAX_COMMITTED_CHECKPOINT_ID); + if (value != null) { + lastCommittedCheckpointId = Long.parseLong(value); + break; + } + } + + Long parentSnapshotId = snapshot.parentId(); + snapshot = parentSnapshotId != null ? table.snapshot(parentSnapshotId) : null; + } + + return lastCommittedCheckpointId; + } + + /** + * Commits the data to the Iceberg table by reading the file data from the {@link DeltaManifests} + * ordered by the checkpointId, and writing the new snapshot to the Iceberg table. The {@link + * SnapshotSummary} will contain the jobId, snapshotId, checkpointId so in case of job restart we + * can identify which changes are committed, and which are still waiting for the commit. 
+ * + * @param commitRequestMap The checkpointId to {@link CommitRequest} map of the changes to commit + * @param newFlinkJobId The jobId to store in the {@link SnapshotSummary} + * @param operatorId The operatorId to store in the {@link SnapshotSummary} + * @throws IOException On commit failure + */ + private void commitPendingRequests( + Table table, + String branch, + NavigableMap>> commitRequestMap, + String newFlinkJobId, + String operatorId) + throws IOException { + long checkpointId = commitRequestMap.lastKey(); + List manifests = Lists.newArrayList(); + NavigableMap> pendingResults = Maps.newTreeMap(); + for (Map.Entry>> e : commitRequestMap.entrySet()) { + for (CommitRequest committable : e.getValue()) { + if (Arrays.equals(EMPTY_MANIFEST_DATA, committable.getCommittable().manifest())) { + pendingResults + .computeIfAbsent(e.getKey(), unused -> Lists.newArrayList()) + .add(EMPTY_WRITE_RESULT); + } else { + DeltaManifests deltaManifests = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, committable.getCommittable().manifest()); + pendingResults + .computeIfAbsent(e.getKey(), unused -> Lists.newArrayList()) + .add(FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs())); + manifests.addAll(deltaManifests.manifests()); + } + } + } + + CommitSummary summary = new CommitSummary(); + summary.addAll(pendingResults); + commitPendingResult(table, branch, pendingResults, summary, newFlinkJobId, operatorId); + if (committerMetrics != null) { + committerMetrics.updateCommitSummary(table.name(), summary); + } + + FlinkManifestUtil.deleteCommittedManifests(table, manifests, newFlinkJobId, checkpointId); + } + + private void commitPendingResult( + Table table, + String branch, + NavigableMap> pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId) { + long totalFiles = summary.dataFilesCount() + summary.deleteFilesCount(); + TableKey key = new TableKey(table.name(), branch); + int continuousEmptyCheckpoints = + continuousEmptyCheckpointsMap.computeIfAbsent(key, unused -> 0); + int maxContinuousEmptyCommits = + maxContinuousEmptyCommitsMap.computeIfAbsent( + key, + unused -> { + int result = + PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); + Preconditions.checkArgument( + result > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); + return result; + }); + continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0; + if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { + if (replacePartitions) { + replacePartitions(table, branch, pendingResults, summary, newFlinkJobId, operatorId); + } else { + commitDeltaTxn(table, branch, pendingResults, summary, newFlinkJobId, operatorId); + } + + continuousEmptyCheckpoints = 0; + } else { + long checkpointId = pendingResults.lastKey(); + LOG.info("Skip commit for checkpoint {} due to no data files or delete files.", checkpointId); + } + + continuousEmptyCheckpointsMap.put(key, continuousEmptyCheckpoints); + } + + private void replacePartitions( + Table table, + String branch, + NavigableMap> pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId) { + for (Map.Entry> e : pendingResults.entrySet()) { + // We don't commit the merged result into a single transaction because for the sequential + // transaction txn1 and txn2, the equality-delete files of txn2 are required to be applied + // to data files from txn1. 
Committing the merged one will lead to the incorrect delete + // semantic. + for (WriteResult result : e.getValue()) { + ReplacePartitions dynamicOverwrite = + table.newReplacePartitions().scanManifestsWith(workerPool); + Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); + commitOperation( + table, + branch, + dynamicOverwrite, + summary, + "dynamic partition overwrite", + newFlinkJobId, + operatorId, + e.getKey()); + } + } + } + + private void commitDeltaTxn( + Table table, + String branch, + NavigableMap> pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId) { + for (Map.Entry> e : pendingResults.entrySet()) { + // We don't commit the merged result into a single transaction because for the sequential + // transaction txn1 and txn2, the equality-delete files of txn2 are required to be applied + // to data files from txn1. Committing the merged one will lead to the incorrect delete + // semantic. + for (WriteResult result : e.getValue()) { + // Row delta validations are not needed for streaming changes that write equality deletes. + // Equality deletes are applied to data in all previous sequence numbers, so retries may + // push deletes further in the future, but do not affect correctness. Position deletes + // committed to the table in this path are used only to delete rows from data files that are + // being added in this commit. There is no way for data files added along with the delete + // files to be concurrently removed, so there is no need to validate the files referenced by + // the position delete files that are being committed. + RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool); + + Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); + Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); + commitOperation( + table, branch, rowDelta, summary, "rowDelta", newFlinkJobId, operatorId, e.getKey()); + } + } + } + + @VisibleForTesting + void commitOperation( + Table table, + String branch, + SnapshotUpdate operation, + CommitSummary summary, + String description, + String newFlinkJobId, + String operatorId, + long checkpointId) { + + LOG.info( + "Committing {} for checkpoint {} to table {} branch {} with summary: {}", + description, + checkpointId, + table.name(), + branch, + summary); + snapshotProperties.forEach(operation::set); + // custom snapshot metadata properties will be overridden if they conflict with internal ones + // used by the sink. + operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); + operation.set(FLINK_JOB_ID, newFlinkJobId); + operation.set(OPERATOR_ID, operatorId); + operation.toBranch(branch); + + long startNano = System.nanoTime(); + operation.commit(); // abort is automatically called if this fails. 
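+ // Track the elapsed commit time so that it can be reported per table via the committer metrics.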
+ long durationMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano); + LOG.info( + "Committed {} to table: {}, branch: {}, checkpointId {} in {} ms", + description, + table.name(), + branch, + checkpointId, + durationMs); + if (committerMetrics != null) { + committerMetrics.commitDuration(table.name(), durationMs); + } + } + + @Override + public void close() throws IOException { + workerPool.shutdown(); + } + + private static class TableKey implements Serializable { + private String tableName; + private String branch; + + TableKey(String tableName, String branch) { + this.tableName = tableName; + this.branch = branch; + } + + TableKey(DynamicCommittable committable) { + this.tableName = committable.key().tableName(); + this.branch = committable.key().branch(); + } + + String tableName() { + return tableName; + } + + String branch() { + return branch; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + + if (other == null || getClass() != other.getClass()) { + return false; + } + + TableKey that = (TableKey) other; + return tableName.equals(that.tableName) && branch.equals(that.branch); + } + + @Override + public int hashCode() { + return Objects.hash(tableName, branch); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("tableName", tableName) + .add("branch", branch) + .toString(); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java new file mode 100644 index 000000000000..d34feea75285 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Map; +import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.flink.sink.CommitSummary; +import org.apache.iceberg.flink.sink.IcebergFilesCommitterMetrics; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +class DynamicCommitterMetrics { + + private final Map metrics; + private final MetricGroup mainMetricsGroup; + + DynamicCommitterMetrics(MetricGroup mainMetricsGroup) { + this.mainMetricsGroup = mainMetricsGroup; + this.metrics = Maps.newHashMap(); + } + + public void commitDuration(String fullTableName, long commitDurationMs) { + committerMetrics(fullTableName).commitDuration(commitDurationMs); + } + + /** This is called upon a successful commit. 
*/ + public void updateCommitSummary(String fullTableName, CommitSummary stats) { + committerMetrics(fullTableName).updateCommitSummary(stats); + } + + private IcebergFilesCommitterMetrics committerMetrics(String fullTableName) { + return metrics.computeIfAbsent( + fullTableName, tableName -> new IcebergFilesCommitterMetrics(mainMetricsGroup, tableName)); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java new file mode 100644 index 000000000000..2715a01608d6 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -0,0 +1,427 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Map; +import java.util.Optional; +import java.util.UUID; +import org.apache.flink.annotation.Experimental; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.sink2.Committer; +import org.apache.flink.api.connector.sink2.CommitterInitContext; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.sink2.SinkWriter; +import org.apache.flink.api.connector.sink2.SupportsCommitter; +import org.apache.flink.api.connector.sink2.WriterInitContext; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessageTypeInfo; +import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; +import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; +import org.apache.flink.streaming.api.connector.sink2.SupportsPostCommitTopology; +import org.apache.flink.streaming.api.connector.sink2.SupportsPreCommitTopology; +import org.apache.flink.streaming.api.connector.sink2.SupportsPreWriteTopology; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.table.data.RowData; +import org.apache.flink.util.OutputTag; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import 
org.apache.iceberg.flink.sink.IcebergSink; +import org.apache.iceberg.flink.sink.SinkUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** + * Dynamic version of the IcebergSink which supports: + * + *
+ * <ul>
+ *   <li>Writing to any number of tables (No more 1:1 sink/topic relationship).
+ *   <li>Creating and updating tables based on the user-supplied routing.
+ *   <li>Updating the schema and partition spec of tables based on the user-supplied specification.
+ * </ul>
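+ *
+ * <p>A minimal usage sketch; {@code events}, the {@code toDynamicRecord} conversion helper, and
+ * {@code catalogLoader} are placeholders, not part of this API:
+ *
+ * <pre>{@code
+ * DynamicIcebergSink.forInput(events)
+ *     .generator((event, out) -> out.collect(toDynamicRecord(event)))
+ *     .catalogLoader(catalogLoader)
+ *     .writeParallelism(2)
+ *     .append();
+ * }</pre>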
+ */ +@Experimental +public class DynamicIcebergSink + implements Sink, + SupportsPreWriteTopology, + SupportsCommitter, + SupportsPreCommitTopology, + SupportsPostCommitTopology { + + private final CatalogLoader catalogLoader; + private final Map snapshotProperties; + private final String uidPrefix; + private final String sinkId; + private final Map writeProperties; + private final transient FlinkWriteConf flinkWriteConf; + private final FileFormat dataFileFormat; + private final long targetDataFileSize; + private final boolean overwriteMode; + private final int workerPoolSize; + private final int cacheMaximumSize; + + DynamicIcebergSink( + CatalogLoader catalogLoader, + Map snapshotProperties, + String uidPrefix, + Map writeProperties, + FlinkWriteConf flinkWriteConf, + int cacheMaximumSize) { + this.catalogLoader = catalogLoader; + this.snapshotProperties = snapshotProperties; + this.uidPrefix = uidPrefix; + this.writeProperties = writeProperties; + this.flinkWriteConf = flinkWriteConf; + this.dataFileFormat = flinkWriteConf.dataFileFormat(); + this.targetDataFileSize = flinkWriteConf.targetDataFileSize(); + this.overwriteMode = flinkWriteConf.overwriteMode(); + this.workerPoolSize = flinkWriteConf.workerPoolSize(); + this.cacheMaximumSize = cacheMaximumSize; + // We generate a random UUID every time when a sink is created. + // This is used to separate files generated by different sinks writing the same table. + // Also used to generate the aggregator operator name + this.sinkId = UUID.randomUUID().toString(); + } + + @Override + public SinkWriter createWriter(WriterInitContext context) { + return new DynamicWriter( + catalogLoader.loadCatalog(), + dataFileFormat, + targetDataFileSize, + writeProperties, + cacheMaximumSize, + new DynamicWriterMetrics(context.metricGroup()), + context.getTaskInfo().getIndexOfThisSubtask(), + context.getTaskInfo().getAttemptNumber()); + } + + @Override + public Committer createCommitter(CommitterInitContext context) { + DynamicCommitterMetrics metrics = new DynamicCommitterMetrics(context.metricGroup()); + return new DynamicCommitter( + catalogLoader.loadCatalog(), + snapshotProperties, + overwriteMode, + workerPoolSize, + sinkId, + metrics); + } + + @Override + public SimpleVersionedSerializer getCommittableSerializer() { + return new DynamicCommittableSerializer(); + } + + @Override + public void addPostCommitTopology( + DataStream> committables) {} + + @Override + public DataStream addPreWriteTopology( + DataStream inputDataStream) { + return distributeDataStream(inputDataStream); + } + + @Override + public DataStream> addPreCommitTopology( + DataStream> writeResults) { + TypeInformation> typeInformation = + CommittableMessageTypeInfo.of(this::getCommittableSerializer); + + return writeResults + .keyBy( + committable -> { + if (committable instanceof CommittableSummary) { + return "__summary"; + } else { + CommittableWithLineage result = + (CommittableWithLineage) committable; + return result.getCommittable().key().tableName(); + } + }) + .transform( + prefixIfNotNull(uidPrefix, sinkId + " Pre Commit"), + typeInformation, + new DynamicWriteResultAggregator(catalogLoader)) + .uid(prefixIfNotNull(uidPrefix, sinkId + "-pre-commit-topology")); + } + + @Override + public SimpleVersionedSerializer getWriteResultSerializer() { + return new DynamicWriteResultSerializer(); + } + + public static class Builder { + private DataStream input; + private DynamicRecordGenerator generator; + private CatalogLoader catalogLoader; + private String uidPrefix = null; + 
private final Map writeOptions = Maps.newHashMap(); + private final Map snapshotSummary = Maps.newHashMap(); + private ReadableConfig readableConfig = new Configuration(); + private boolean immediateUpdate = false; + private int cacheMaximumSize = 100; + private long cacheRefreshMs = 1_000; + private int inputSchemasPerTableCacheMaximumSize = 10; + + Builder() {} + + public Builder forInput(DataStream inputStream) { + this.input = inputStream; + return this; + } + + public Builder generator(DynamicRecordGenerator inputGenerator) { + this.generator = inputGenerator; + return this; + } + + /** + * The catalog loader is used for loading tables in {@link DynamicCommitter} lazily, we need + * this loader because {@link Table} is not serializable and could not just use the loaded table + * from Builder#table in the remote task manager. + * + * @param newCatalogLoader to load iceberg table inside tasks. + * @return {@link Builder} to connect the iceberg table. + */ + public Builder catalogLoader(CatalogLoader newCatalogLoader) { + this.catalogLoader = newCatalogLoader; + return this; + } + + /** + * Set the write properties for IcebergSink. View the supported properties in {@link + * FlinkWriteOptions} + */ + public Builder set(String property, String value) { + writeOptions.put(property, value); + return this; + } + + /** + * Set the write properties for IcebergSink. View the supported properties in {@link + * FlinkWriteOptions} + */ + public Builder setAll(Map properties) { + writeOptions.putAll(properties); + return this; + } + + public Builder overwrite(boolean newOverwrite) { + writeOptions.put(FlinkWriteOptions.OVERWRITE_MODE.key(), Boolean.toString(newOverwrite)); + return this; + } + + public Builder flinkConf(ReadableConfig config) { + this.readableConfig = config; + return this; + } + + /** + * Configuring the write parallel number for iceberg stream writer. + * + * @param newWriteParallelism the number of parallel iceberg stream writer. + * @return {@link DynamicIcebergSink.Builder} to connect the iceberg table. + */ + public Builder writeParallelism(int newWriteParallelism) { + writeOptions.put( + FlinkWriteOptions.WRITE_PARALLELISM.key(), Integer.toString(newWriteParallelism)); + return this; + } + + /** + * Set the uid prefix for IcebergSink operators. Note that IcebergSink internally consists of + * multiple operators (like writer, committer, aggregator) Actual operator uid will be appended + * with a suffix like "uidPrefix-writer". + * + *
+ * <p>If provided, this prefix is also applied to operator names.
+ *
+ * <p>Flink auto-generates operator uids if not set explicitly. It is a recommended best practice
+ * to set uids for all operators before deploying to production. Flink has an option,
+ * {@code pipeline.auto-generate-uids=false}, to disable auto-generation and force explicit
+ * setting of all operator uids.
+ *
+ * <p>
Be careful with setting this for an existing job, because now we are changing the operator + * uid from an auto-generated one to this new value. When deploying the change with a + * checkpoint, Flink won't be able to restore the previous IcebergSink operator state (more + * specifically the committer operator state). You need to use {@code --allowNonRestoredState} + * to ignore the previous sink state. During restore IcebergSink state is used to check if last + * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss + * if the Iceberg commit failed in the last completed checkpoint. + * + * @param newPrefix prefix for Flink sink operator uid and name + * @return {@link Builder} to connect the iceberg table. + */ + public Builder uidPrefix(String newPrefix) { + this.uidPrefix = newPrefix; + return this; + } + + public Builder snapshotProperties(Map properties) { + snapshotSummary.putAll(properties); + return this; + } + + public Builder setSnapshotProperty(String property, String value) { + snapshotSummary.put(property, value); + return this; + } + + public Builder toBranch(String branch) { + writeOptions.put(FlinkWriteOptions.BRANCH.key(), branch); + return this; + } + + public Builder immediateTableUpdate(boolean newImmediateUpdate) { + this.immediateUpdate = newImmediateUpdate; + return this; + } + + /** Maximum size of the caches used in Dynamic Sink for table data and serializers. */ + public Builder cacheMaxSize(int maxSize) { + this.cacheMaximumSize = maxSize; + return this; + } + + /** Maximum interval for cache items renewals. */ + public Builder cacheRefreshMs(long refreshMs) { + this.cacheRefreshMs = refreshMs; + return this; + } + + /** + * Maximum input {@link org.apache.iceberg.Schema} objects to cache per each Iceberg table. The + * cache improves Dynamic Sink performance by storing {@link org.apache.iceberg.Schema} + * comparison results. + */ + public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCacheMaxSize) { + this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaxSize; + return this; + } + + private String operatorName(String suffix) { + return uidPrefix != null ? uidPrefix + "-" + suffix : suffix; + } + + private DynamicIcebergSink build() { + + Preconditions.checkArgument( + generator != null, "Please use withGenerator() to convert the input DataStream."); + Preconditions.checkNotNull(catalogLoader, "Catalog loader shouldn't be null"); + + FlinkWriteConf flinkWriteConf = new FlinkWriteConf(writeOptions, readableConfig); + Map writeProperties = + SinkUtil.writeProperties(flinkWriteConf.dataFileFormat(), flinkWriteConf, null); + uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + + return instantiateSink(writeProperties, flinkWriteConf); + } + + @VisibleForTesting + DynamicIcebergSink instantiateSink( + Map writeProperties, FlinkWriteConf flinkWriteConf) { + return new DynamicIcebergSink( + catalogLoader, + snapshotSummary, + uidPrefix, + writeProperties, + flinkWriteConf, + cacheMaximumSize); + } + + /** + * Append the iceberg sink operators to write records to iceberg table. + * + * @return {@link DataStreamSink} for sink. 
+ */ + public DataStreamSink append() { + DynamicRecordInternalType type = + new DynamicRecordInternalType(catalogLoader, false, cacheMaximumSize); + DynamicIcebergSink sink = build(); + SingleOutputStreamOperator converted = + input + .process( + new DynamicRecordProcessor<>( + generator, + catalogLoader, + immediateUpdate, + cacheMaximumSize, + cacheRefreshMs, + inputSchemasPerTableCacheMaximumSize)) + .uid(prefixIfNotNull(uidPrefix, "-generator")) + .name(operatorName("generator")) + .returns(type); + + DataStreamSink rowDataDataStreamSink = + converted + .getSideOutput( + new OutputTag<>( + DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, + new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize))) + .keyBy((KeySelector) DynamicRecordInternal::tableName) + .map( + new DynamicTableUpdateOperator( + catalogLoader, + cacheMaximumSize, + cacheRefreshMs, + inputSchemasPerTableCacheMaximumSize)) + .uid(prefixIfNotNull(uidPrefix, "-updater")) + .name(operatorName("Updater")) + .returns(type) + .union(converted) + .sinkTo(sink) + .uid(prefixIfNotNull(uidPrefix, "-sink")); + if (sink.flinkWriteConf.writeParallelism() != null) { + rowDataDataStreamSink.setParallelism(sink.flinkWriteConf.writeParallelism()); + } + + return rowDataDataStreamSink; + } + } + + DataStream distributeDataStream(DataStream input) { + return input.keyBy(DynamicRecordInternal::writerKey); + } + + private static String prefixIfNotNull(String uidPrefix, String suffix) { + return uidPrefix != null ? uidPrefix + "-" + suffix : suffix; + } + + /** + * Initialize a {@link IcebergSink.Builder} to export the data from input data stream with {@link + * RowData}s into iceberg table. + * + * @param input the source input data stream with {@link RowData}s. + * @return {@link IcebergSink.Builder} to connect the iceberg table. + */ + public static Builder forInput(DataStream input) { + return new Builder().forInput(input); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java new file mode 100644 index 000000000000..600a4d8b950c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.TableIdentifier; + +/** A DynamicRecord contains RowData alongside with the Iceberg table metadata. 
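+ *
+ * <p>For example, a record routed to a hypothetical {@code db.events} table, where {@code schema}
+ * and {@code row} are assumed to exist:
+ *
+ * <pre>{@code
+ * DynamicRecord record =
+ *     new DynamicRecord(
+ *         TableIdentifier.of("db", "events"),
+ *         "main",
+ *         schema,
+ *         row,
+ *         PartitionSpec.unpartitioned(),
+ *         DistributionMode.NONE,
+ *         2);
+ * }</pre>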
*/ +public class DynamicRecord { + + private TableIdentifier tableIdentifier; + private String branch; + private Schema schema; + private RowData rowData; + private PartitionSpec partitionSpec; + private DistributionMode distributionMode; + private int writeParallelism; + private boolean upsertMode; + @Nullable private Set equalityFields; + + public DynamicRecord( + TableIdentifier tableIdentifier, + String branch, + Schema schema, + RowData rowData, + PartitionSpec partitionSpec, + DistributionMode distributionMode, + int writeParallelism) { + this.tableIdentifier = tableIdentifier; + this.branch = branch; + this.schema = schema; + this.partitionSpec = partitionSpec; + this.rowData = rowData; + this.distributionMode = distributionMode; + this.writeParallelism = writeParallelism; + } + + public TableIdentifier tableIdentifier() { + return tableIdentifier; + } + + public void setTableIdentifier(TableIdentifier tableIdentifier) { + this.tableIdentifier = tableIdentifier; + } + + public String branch() { + return branch; + } + + public void setBranch(String branch) { + this.branch = branch; + } + + public Schema schema() { + return schema; + } + + public void setSchema(Schema schema) { + this.schema = schema; + } + + public PartitionSpec spec() { + return partitionSpec; + } + + public void setPartitionSpec(PartitionSpec partitionSpec) { + this.partitionSpec = partitionSpec; + } + + public RowData rowData() { + return rowData; + } + + public void setRowData(RowData rowData) { + this.rowData = rowData; + } + + public DistributionMode distributionMode() { + return distributionMode; + } + + public void setDistributionMode(DistributionMode distributionMode) { + this.distributionMode = distributionMode; + } + + public int writeParallelism() { + return writeParallelism; + } + + public void writeParallelism(int parallelism) { + this.writeParallelism = parallelism; + } + + public boolean upsertMode() { + return upsertMode; + } + + public void setUpsertMode(boolean upsertMode) { + this.upsertMode = upsertMode; + } + + public Set equalityFields() { + return equalityFields; + } + + public void setEqualityFields(Set equalityFields) { + this.equalityFields = equalityFields; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java new file mode 100644 index 000000000000..23319b37d1ba --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.io.Serializable; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.util.Collector; + +/** A generator to yield {@link DynamicRecord} from the provided input. */ +public interface DynamicRecordGenerator extends Serializable { + default void open(OpenContext openContext) throws Exception {} + + /** + * Takes the user-defined input and yields zero, one, or multiple {@link DynamicRecord}s using the + * {@link Collector}. + */ + void generate(T inputRecord, Collector out) throws Exception; +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java new file mode 100644 index 000000000000..fe1f4cdac9b7 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Objects; +import java.util.Set; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; + +@Internal +class DynamicRecordInternal { + + private String tableName; + private String branch; + private Schema schema; + private PartitionSpec spec; + private int writerKey; + private RowData rowData; + private boolean upsertMode; + private Set equalityFieldIds; + + // Required for serialization instantiation + DynamicRecordInternal() {} + + DynamicRecordInternal( + String tableName, + String branch, + Schema schema, + RowData rowData, + PartitionSpec spec, + int writerKey, + boolean upsertMode, + Set equalityFieldsIds) { + this.tableName = tableName; + this.branch = branch; + this.schema = schema; + this.spec = spec; + this.writerKey = writerKey; + this.rowData = rowData; + this.upsertMode = upsertMode; + this.equalityFieldIds = equalityFieldsIds; + } + + public String tableName() { + return tableName; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + + public String branch() { + return branch; + } + + public void setBranch(String branch) { + this.branch = branch; + } + + public Schema schema() { + return schema; + } + + public void setSchema(Schema schema) { + this.schema = schema; + } + + public RowData rowData() { + return rowData; + } + + public void setRowData(RowData rowData) { + this.rowData = rowData; + } + + public PartitionSpec spec() { + return spec; + } + + public void setSpec(PartitionSpec spec) { + this.spec = spec; + } + + public int writerKey() { + return writerKey; + } + + public void setWriterKey(int writerKey) { + this.writerKey = writerKey; + } + + public boolean upsertMode() { + return upsertMode; + } + + public void setUpsertMode(boolean upsertMode) { + this.upsertMode = upsertMode; + } + + public Set equalityFields() { + return equalityFieldIds; + } + + public void setEqualityFieldIds(Set equalityFieldIds) { + this.equalityFieldIds = equalityFieldIds; + } + + @Override + public int hashCode() { + return Objects.hash( + tableName, branch, schema, spec, writerKey, rowData, upsertMode, equalityFieldIds); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + + if (other == null || getClass() != other.getClass()) { + return false; + } + + DynamicRecordInternal that = (DynamicRecordInternal) other; + boolean tableFieldsMatch = + Objects.equals(tableName, that.tableName) + && Objects.equals(branch, that.branch) + && schema.schemaId() == that.schema.schemaId() + && Objects.equals(spec, that.spec) + && writerKey == that.writerKey + && upsertMode == that.upsertMode + && Objects.equals(equalityFieldIds, that.equalityFieldIds); + if (!tableFieldsMatch) { + return false; + } + + if (rowData.getClass().equals(that.rowData.getClass())) { + return Objects.equals(rowData, that.rowData); + } else { + RowDataSerializer rowDataSerializer = new RowDataSerializer(FlinkSchemaUtil.convert(schema)); + return rowDataSerializer + .toBinaryRow(rowData) + .equals(rowDataSerializer.toBinaryRow(that.rowData)); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java new file mode 100644 index 000000000000..b139d9a898bf --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.io.IOException; +import java.util.Collections; +import java.util.Set; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.api.java.tuple.Tuple3; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.hadoop.util.Sets; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.PartitionSpecParser; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; + +@Internal +class DynamicRecordInternalSerializer extends TypeSerializer { + + private static final long serialVersionUID = 1L; + + private final TableSerializerCache serializerCache; + private final boolean writeSchemaAndSpec; + + DynamicRecordInternalSerializer( + TableSerializerCache serializerCache, boolean writeSchemaAndSpec) { + this.serializerCache = serializerCache; + this.writeSchemaAndSpec = writeSchemaAndSpec; + } + + @Override + public TypeSerializer duplicate() { + return new DynamicRecordInternalSerializer( + new TableSerializerCache(serializerCache.catalogLoader(), serializerCache.maximumSize()), + writeSchemaAndSpec); + } + + @Override + public DynamicRecordInternal createInstance() { + return new DynamicRecordInternal(); + } + + @Override + public void serialize(DynamicRecordInternal toSerialize, DataOutputView dataOutputView) + throws IOException { + dataOutputView.writeUTF(toSerialize.tableName()); + dataOutputView.writeUTF(toSerialize.branch()); + if (writeSchemaAndSpec) { + dataOutputView.writeUTF(SchemaParser.toJson(toSerialize.schema())); + dataOutputView.writeUTF(PartitionSpecParser.toJson(toSerialize.spec())); + } else { + dataOutputView.writeInt(toSerialize.schema().schemaId()); + dataOutputView.writeInt(toSerialize.spec().specId()); + } + + dataOutputView.writeInt(toSerialize.writerKey()); + final RowDataSerializer rowDataSerializer; + if (writeSchemaAndSpec) { + rowDataSerializer = + serializerCache.serializer( + toSerialize.tableName(), toSerialize.schema(), toSerialize.spec()); + } else { + // Check that the 
schema id can be resolved. Not strictly necessary for serialization. + Tuple3 serializer = + serializerCache.serializerWithSchemaAndSpec( + toSerialize.tableName(), + toSerialize.schema().schemaId(), + toSerialize.spec().specId()); + rowDataSerializer = serializer.f0; + } + + rowDataSerializer.serialize(toSerialize.rowData(), dataOutputView); + dataOutputView.writeBoolean(toSerialize.upsertMode()); + dataOutputView.writeInt(toSerialize.equalityFields().size()); + for (Integer equalityField : toSerialize.equalityFields()) { + dataOutputView.writeInt(equalityField); + } + } + + @Override + public DynamicRecordInternal deserialize(DataInputView dataInputView) throws IOException { + String tableName = dataInputView.readUTF(); + String branch = dataInputView.readUTF(); + + final Schema schema; + final PartitionSpec spec; + final RowDataSerializer rowDataSerializer; + if (writeSchemaAndSpec) { + schema = SchemaParser.fromJson(dataInputView.readUTF()); + spec = PartitionSpecParser.fromJson(schema, dataInputView.readUTF()); + rowDataSerializer = serializerCache.serializer(tableName, schema, spec); + } else { + Integer schemaId = dataInputView.readInt(); + Integer specId = dataInputView.readInt(); + Tuple3 serializerWithSchemaAndSpec = + serializerCache.serializerWithSchemaAndSpec(tableName, schemaId, specId); + schema = serializerWithSchemaAndSpec.f1; + spec = serializerWithSchemaAndSpec.f2; + rowDataSerializer = serializerWithSchemaAndSpec.f0; + } + + int writerKey = dataInputView.readInt(); + RowData rowData = rowDataSerializer.deserialize(dataInputView); + boolean upsertMode = dataInputView.readBoolean(); + int numEqualityFields = dataInputView.readInt(); + final Set equalityFieldIds; + if (numEqualityFields > 0) { + equalityFieldIds = Sets.newHashSetWithExpectedSize(numEqualityFields); + } else { + equalityFieldIds = Collections.emptySet(); + } + + for (int i = 0; i < numEqualityFields; i++) { + equalityFieldIds.add(dataInputView.readInt()); + } + + return new DynamicRecordInternal( + tableName, branch, schema, rowData, spec, writerKey, upsertMode, equalityFieldIds); + } + + @Override + public DynamicRecordInternal deserialize(DynamicRecordInternal reuse, DataInputView dataInputView) + throws IOException { + String tableName = dataInputView.readUTF(); + reuse.setTableName(tableName); + String branch = dataInputView.readUTF(); + reuse.setBranch(branch); + + final Schema schema; + final PartitionSpec spec; + final RowDataSerializer rowDataSerializer; + if (writeSchemaAndSpec) { + schema = SchemaParser.fromJson(dataInputView.readUTF()); + spec = PartitionSpecParser.fromJson(schema, dataInputView.readUTF()); + reuse.setSchema(schema); + reuse.setSpec(spec); + rowDataSerializer = serializerCache.serializer(tableName, schema, spec); + } else { + Integer schemaId = dataInputView.readInt(); + Integer specId = dataInputView.readInt(); + Tuple3 serializerWithSchemaAndSpec = + serializerCache.serializerWithSchemaAndSpec(tableName, schemaId, specId); + schema = serializerWithSchemaAndSpec.f1; + spec = serializerWithSchemaAndSpec.f2; + rowDataSerializer = serializerWithSchemaAndSpec.f0; + } + + int writerKey = dataInputView.readInt(); + reuse.setWriterKey(writerKey); + RowData rowData = rowDataSerializer.deserialize(dataInputView); + boolean upsertMode = dataInputView.readBoolean(); + int numEqualityFields = dataInputView.readInt(); + final Set equalityFieldIds; + if (numEqualityFields > 0) { + equalityFieldIds = Sets.newHashSetWithExpectedSize(numEqualityFields); + } else { + equalityFieldIds = 
Collections.emptySet(); + } + for (int i = 0; i < numEqualityFields; i++) { + equalityFieldIds.add(dataInputView.readInt()); + } + return new DynamicRecordInternal( + tableName, branch, schema, rowData, spec, writerKey, upsertMode, equalityFieldIds); + } + + @Override + public DynamicRecordInternal copy(DynamicRecordInternal from) { + return new DynamicRecordInternal( + from.tableName(), + from.branch(), + from.schema(), + from.rowData(), + from.spec(), + from.writerKey(), + from.upsertMode(), + from.equalityFields()); + } + + @Override + public DynamicRecordInternal copy(DynamicRecordInternal from, DynamicRecordInternal reuse) { + reuse.setTableName(from.tableName()); + reuse.setBranch(from.branch()); + reuse.setSchema(from.schema()); + reuse.setSpec(from.spec()); + reuse.setWriterKey(from.writerKey()); + reuse.setRowData(from.rowData()); + reuse.setUpsertMode(from.upsertMode()); + reuse.setEqualityFieldIds(from.equalityFields()); + return reuse; + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws IOException { + serialize(deserialize(source), target); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj instanceof DynamicRecordInternalSerializer) { + DynamicRecordInternalSerializer other = (DynamicRecordInternalSerializer) obj; + return writeSchemaAndSpec == other.writeSchemaAndSpec; + } + return false; + } + + @Override + public int hashCode() { + return Boolean.hashCode(writeSchemaAndSpec); + } + + @Override + public boolean isImmutableType() { + return false; + } + + @Override + public int getLength() { + return -1; + } + + @Override + public TypeSerializerSnapshot snapshotConfiguration() { + return new DynamicRecordInternalTypeSerializerSnapshot(writeSchemaAndSpec); + } + + public static class DynamicRecordInternalTypeSerializerSnapshot + implements TypeSerializerSnapshot { + + private boolean writeSchemaAndSpec; + + // Zero args constructor is required to instantiate this class on restore + @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) + public DynamicRecordInternalTypeSerializerSnapshot() {} + + DynamicRecordInternalTypeSerializerSnapshot(boolean writeSchemaAndSpec) { + this.writeSchemaAndSpec = writeSchemaAndSpec; + } + + @Override + public int getCurrentVersion() { + return 0; + } + + @Override + public void writeSnapshot(DataOutputView out) throws IOException { + out.writeBoolean(writeSchemaAndSpec); + } + + @Override + public void readSnapshot(int readVersion, DataInputView in, ClassLoader userCodeClassLoader) + throws IOException { + this.writeSchemaAndSpec = in.readBoolean(); + } + + @Override + public TypeSerializerSchemaCompatibility resolveSchemaCompatibility( + TypeSerializerSnapshot oldSerializerSnapshot) { + return TypeSerializerSchemaCompatibility.compatibleAsIs(); + } + + @Override + public TypeSerializer restoreSerializer() { + // Note: We pass in a null serializer cache which would create issues if we tried to use this + // restored serializer, but since we are using {@code + // TypeSerializerSchemaCompatibility.compatibleAsIs()} above, this serializer will never be + // used. A new one will be created via {@code DynamicRecordInternalType}. 
+ return new DynamicRecordInternalSerializer(null, writeSchemaAndSpec); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java new file mode 100644 index 000000000000..c18c8f670daf --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.serialization.SerializerConfig; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.iceberg.flink.CatalogLoader; + +@Internal +class DynamicRecordInternalType extends TypeInformation { + + private final CatalogLoader catalogLoader; + private final boolean writeSchemaAndSpec; + private final int cacheSize; + + DynamicRecordInternalType( + CatalogLoader catalogLoader, boolean writeSchemaAndSpec, int cacheSize) { + this.catalogLoader = catalogLoader; + this.writeSchemaAndSpec = writeSchemaAndSpec; + this.cacheSize = cacheSize; + } + + @Override + public boolean isBasicType() { + return false; + } + + @Override + public boolean isTupleType() { + return false; + } + + @Override + public int getArity() { + return 0; + } + + @Override + public int getTotalFields() { + return 1; + } + + @Override + public Class getTypeClass() { + return DynamicRecordInternal.class; + } + + @Override + public boolean isKeyType() { + return false; + } + + @Override + public TypeSerializer createSerializer(SerializerConfig serializerConfig) { + return new DynamicRecordInternalSerializer( + new TableSerializerCache(catalogLoader, cacheSize), writeSchemaAndSpec); + } + + @Override + public String toString() { + return getClass().getName(); + } + + @Override + public boolean equals(Object o) { + return canEqual(o); + } + + @Override + public int hashCode() { + return getClass().getName().hashCode(); + } + + @Override + public boolean canEqual(Object o) { + return o instanceof DynamicRecordInternalType; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java new file mode 100644 index 000000000000..166217a0140e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.table.data.RowData; +import org.apache.flink.util.Collector; +import org.apache.flink.util.OutputTag; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.flink.CatalogLoader; + +@Internal +class DynamicRecordProcessor extends ProcessFunction + implements Collector { + @VisibleForTesting + static final String DYNAMIC_TABLE_UPDATE_STREAM = "dynamic-table-update-stream"; + + private final DynamicRecordGenerator generator; + private final CatalogLoader catalogLoader; + private final boolean immediateUpdate; + private final int cacheMaximumSize; + private final long cacheRefreshMs; + private final int inputSchemasPerTableCacheMaximumSize; + + private transient TableMetadataCache tableCache; + private transient HashKeyGenerator hashKeyGenerator; + private transient TableUpdater updater; + private transient OutputTag updateStream; + private transient Collector collector; + private transient Context context; + + DynamicRecordProcessor( + DynamicRecordGenerator generator, + CatalogLoader catalogLoader, + boolean immediateUpdate, + int cacheMaximumSize, + long cacheRefreshMs, + int inputSchemasPerTableCacheMaximumSize) { + this.generator = generator; + this.catalogLoader = catalogLoader; + this.immediateUpdate = immediateUpdate; + this.cacheMaximumSize = cacheMaximumSize; + this.cacheRefreshMs = cacheRefreshMs; + this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; + } + + @Override + public void open(OpenContext openContext) throws Exception { + super.open(openContext); + Catalog catalog = catalogLoader.loadCatalog(); + this.tableCache = + new TableMetadataCache( + catalog, cacheMaximumSize, cacheRefreshMs, inputSchemasPerTableCacheMaximumSize); + this.hashKeyGenerator = + new HashKeyGenerator( + cacheMaximumSize, getRuntimeContext().getTaskInfo().getMaxNumberOfParallelSubtasks()); + if (immediateUpdate) { + updater = new TableUpdater(tableCache, catalog); + } else { + updateStream = + new OutputTag<>( + DYNAMIC_TABLE_UPDATE_STREAM, + new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize)) {}; + } + + generator.open(openContext); + } + + @Override + public void processElement(T element, Context ctx, Collector out) + throws Exception { + this.context = ctx; + this.collector = out; + generator.generate(element, this); + } + + @Override + public void collect(DynamicRecord data) { + boolean exists = 
tableCache.exists(data.tableIdentifier()).f0; + String foundBranch = exists ? tableCache.branch(data.tableIdentifier(), data.branch()) : null; + + TableMetadataCache.ResolvedSchemaInfo foundSchema = + exists + ? tableCache.schema(data.tableIdentifier(), data.schema()) + : TableMetadataCache.NOT_FOUND; + + PartitionSpec foundSpec = exists ? tableCache.spec(data.tableIdentifier(), data.spec()) : null; + + if (!exists + || foundBranch == null + || foundSpec == null + || foundSchema.compareResult() == CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED) { + if (immediateUpdate) { + Tuple2 newData = + updater.update(data.tableIdentifier(), data.branch(), data.schema(), data.spec()); + emit( + collector, + data, + newData.f0.resolvedTableSchema(), + newData.f0.recordConverter(), + newData.f1); + } else { + int writerKey = + hashKeyGenerator.generateKey( + data, + foundSchema.resolvedTableSchema() != null + ? foundSchema.resolvedTableSchema() + : data.schema(), + foundSpec != null ? foundSpec : data.spec(), + data.rowData()); + context.output( + updateStream, + new DynamicRecordInternal( + data.tableIdentifier().toString(), + data.branch(), + data.schema(), + data.rowData(), + data.spec(), + writerKey, + data.upsertMode(), + DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), data.schema()))); + } + } else { + emit( + collector, + data, + foundSchema.resolvedTableSchema(), + foundSchema.recordConverter(), + foundSpec); + } + } + + private void emit( + Collector out, + DynamicRecord data, + Schema schema, + DataConverter recordConverter, + PartitionSpec spec) { + RowData rowData = (RowData) recordConverter.convert(data.rowData()); + int writerKey = hashKeyGenerator.generateKey(data, schema, spec, rowData); + String tableName = data.tableIdentifier().toString(); + out.collect( + new DynamicRecordInternal( + tableName, + data.branch(), + schema, + rowData, + spec, + writerKey, + data.upsertMode(), + DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema))); + } + + @Override + public void close() { + try { + super.close(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java new file mode 100644 index 000000000000..6ea6dcab867a --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Collections; +import java.util.Set; +import org.apache.hadoop.util.Sets; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Types; + +class DynamicSinkUtil { + + private DynamicSinkUtil() {} + + static Set getEqualityFieldIds(Set equalityFields, Schema schema) { + if (equalityFields == null || equalityFields.isEmpty()) { + if (!schema.identifierFieldIds().isEmpty()) { + return schema.identifierFieldIds(); + } else { + return Collections.emptySet(); + } + } + + Set equalityFieldIds = Sets.newHashSetWithExpectedSize(equalityFields.size()); + for (String equalityField : equalityFields) { + Types.NestedField field = schema.findField(equalityField); + Preconditions.checkNotNull( + field, "Equality field %s does not exist in schema", equalityField); + equalityFieldIds.add(field.fieldId()); + } + + return equalityFieldIds; + } + + static int safeAbs(int input) { + if (input >= 0) { + return input; + } + + if (input == Integer.MIN_VALUE) { + // -Integer.MIN_VALUE would be Integer.MIN_VALUE due to integer overflow. Map to + // Integer.MAX_VALUE instead! + return Integer.MAX_VALUE; + } + + return -input; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java new file mode 100644 index 000000000000..6057d773c3f0 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.CatalogLoader; + +/** + * An optional operator to perform table updates for tables (e.g. schema update) in a non-concurrent + * way. Records must be keyed / routed to this operator by table name to ensure non-concurrent + * updates. The operator itself forwards the record after updating schema / spec of the table. The + * update is also reflected in the record. 
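+ *
+ * <p>A rough wiring sketch (illustrative only; {@code records}, {@code catalogLoader} and the cache
+ * settings are placeholder assumptions rather than values required by this operator):
+ *
+ * <pre>{@code
+ * DataStream<DynamicRecordInternal> updated =
+ *     records
+ *         .keyBy(DynamicRecordInternal::tableName)
+ *         .map(new DynamicTableUpdateOperator(catalogLoader, 100, 1_000L, 10));
+ * }</pre>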
+ */ +@Internal +class DynamicTableUpdateOperator + extends RichMapFunction { + private final CatalogLoader catalogLoader; + private final int cacheMaximumSize; + private final long cacheRefreshMs; + private final int inputSchemasPerTableCacheMaximumSize; + + private transient TableUpdater updater; + + DynamicTableUpdateOperator( + CatalogLoader catalogLoader, + int cacheMaximumSize, + long cacheRefreshMs, + int inputSchemasPerTableCacheMaximumSize) { + this.catalogLoader = catalogLoader; + this.cacheMaximumSize = cacheMaximumSize; + this.cacheRefreshMs = cacheRefreshMs; + this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; + } + + @Override + public void open(OpenContext openContext) throws Exception { + super.open(openContext); + Catalog catalog = catalogLoader.loadCatalog(); + this.updater = + new TableUpdater( + new TableMetadataCache( + catalog, cacheMaximumSize, cacheRefreshMs, inputSchemasPerTableCacheMaximumSize), + catalog); + } + + @Override + public DynamicRecordInternal map(DynamicRecordInternal data) throws Exception { + Tuple2 newData = + updater.update( + TableIdentifier.parse(data.tableName()), data.branch(), data.schema(), data.spec()); + TableMetadataCache.ResolvedSchemaInfo compareInfo = newData.f0; + + data.setSchema(compareInfo.resolvedTableSchema()); + data.setSpec(newData.f1); + + RowData newRowData = (RowData) newData.f0.recordConverter().convert(data.rowData()); + data.setRowData(newRowData); + + return data; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java new file mode 100644 index 000000000000..85806f932ad5 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import org.apache.iceberg.io.WriteResult; + +class DynamicWriteResult { + + private final WriteTarget key; + private final WriteResult writeResult; + + DynamicWriteResult(WriteTarget key, WriteResult writeResult) { + this.key = key; + this.writeResult = writeResult; + } + + WriteTarget key() { + return key; + } + + WriteResult writeResult() { + return writeResult; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java new file mode 100644 index 000000000000..58ba183dfcd4 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import java.io.IOException; +import java.time.Duration; +import java.util.Collection; +import java.util.Map; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; +import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; +import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.sink.DeltaManifests; +import org.apache.iceberg.flink.sink.DeltaManifestsSerializer; +import org.apache.iceberg.flink.sink.FlinkManifestUtil; +import org.apache.iceberg.flink.sink.ManifestOutputFileFactory; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Operator which aggregates the individual {@link WriteResult} objects to a single {@link + * DynamicCommittable} per checkpoint (storing the serialized {@link DeltaManifests}, jobId, + * operatorId, checkpointId) + */ +class DynamicWriteResultAggregator + extends AbstractStreamOperator> + implements OneInputStreamOperator< + CommittableMessage, 
CommittableMessage> { + private static final Logger LOG = LoggerFactory.getLogger(DynamicWriteResultAggregator.class); + private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; + private static final Duration CACHE_EXPIRATION_DURATION = Duration.ofMinutes(1); + + private final CatalogLoader catalogLoader; + private transient Map> results; + private transient Cache> specs; + private transient Cache outputFileFactories; + private transient String flinkJobId; + private transient String operatorId; + private transient int subTaskId; + private transient int attemptId; + private transient Catalog catalog; + + DynamicWriteResultAggregator(CatalogLoader catalogLoader) { + this.catalogLoader = catalogLoader; + } + + @Override + public void open() throws Exception { + this.flinkJobId = getContainingTask().getEnvironment().getJobID().toString(); + this.operatorId = getOperatorID().toString(); + this.subTaskId = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); + this.attemptId = getRuntimeContext().getTaskInfo().getAttemptNumber(); + this.results = Maps.newHashMap(); + this.specs = + Caffeine.newBuilder().expireAfterWrite(CACHE_EXPIRATION_DURATION).softValues().build(); + this.outputFileFactories = + Caffeine.newBuilder().expireAfterWrite(CACHE_EXPIRATION_DURATION).softValues().build(); + this.catalog = catalogLoader.loadCatalog(); + } + + @Override + public void finish() throws IOException { + prepareSnapshotPreBarrier(Long.MAX_VALUE); + } + + @Override + public void prepareSnapshotPreBarrier(long checkpointId) throws IOException { + Collection> committables = + Sets.newHashSetWithExpectedSize(results.size()); + int count = 0; + for (Map.Entry> entries : results.entrySet()) { + committables.add( + new CommittableWithLineage<>( + new DynamicCommittable( + entries.getKey(), + writeToManifest(entries.getKey(), entries.getValue(), checkpointId), + getContainingTask().getEnvironment().getJobID().toString(), + getRuntimeContext().getOperatorUniqueID(), + checkpointId), + checkpointId, + count)); + ++count; + } + + output.collect( + new StreamRecord<>( + new CommittableSummary<>(subTaskId, count, checkpointId, count, count, 0))); + committables.forEach( + c -> + output.collect( + new StreamRecord<>( + new CommittableWithLineage<>(c.getCommittable(), checkpointId, subTaskId)))); + LOG.info("Emitted {} commit message to downstream committer operator", count); + results.clear(); + } + + /** + * Write all the completed data files to a newly created manifest file and return the manifest's + * avro serialized bytes. 
+ */ + @VisibleForTesting + byte[] writeToManifest( + WriteTarget key, Collection writeResults, long checkpointId) + throws IOException { + if (writeResults.isEmpty()) { + return EMPTY_MANIFEST_DATA; + } + + WriteResult.Builder builder = WriteResult.builder(); + writeResults.forEach(w -> builder.add(w.writeResult())); + WriteResult result = builder.build(); + + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + result, + () -> outputFileFactory(key.tableName()).create(checkpointId), + spec(key.tableName(), key.specId())); + + return SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, deltaManifests); + } + + @Override + public void processElement(StreamRecord> element) + throws Exception { + + if (element.isRecord() && element.getValue() instanceof CommittableWithLineage) { + DynamicWriteResult result = + ((CommittableWithLineage) element.getValue()).getCommittable(); + WriteTarget key = result.key(); + results.computeIfAbsent(key, unused -> Sets.newHashSet()).add(result); + } + } + + private ManifestOutputFileFactory outputFileFactory(String tableName) { + return outputFileFactories.get( + tableName, + unused -> { + Table table = catalog.loadTable(TableIdentifier.parse(tableName)); + specs.put(tableName, table.specs()); + return FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, subTaskId, attemptId); + }); + } + + private PartitionSpec spec(String tableName, int specId) { + Map knownSpecs = specs.getIfPresent(tableName); + if (knownSpecs != null) { + PartitionSpec spec = knownSpecs.get(specId); + if (spec != null) { + return spec; + } + } + + Table table = catalog.loadTable(TableIdentifier.parse(tableName)); + return table.specs().get(specId); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java new file mode 100644 index 000000000000..cf5f423fd7ff --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputViewStreamWrapper; +import org.apache.iceberg.flink.sink.WriteResultSerializer; +import org.apache.iceberg.io.WriteResult; + +class DynamicWriteResultSerializer implements SimpleVersionedSerializer { + + private static final int VERSION = 1; + private static final WriteResultSerializer WRITE_RESULT_SERIALIZER = new WriteResultSerializer(); + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(DynamicWriteResult writeResult) throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + DataOutputViewStreamWrapper view = new DataOutputViewStreamWrapper(out); + writeResult.key().serializeTo(view); + byte[] result = WRITE_RESULT_SERIALIZER.serialize(writeResult.writeResult()); + view.write(result); + return out.toByteArray(); + } + + @Override + public DynamicWriteResult deserialize(int version, byte[] serialized) throws IOException { + if (version == 1) { + DataInputDeserializer view = new DataInputDeserializer(serialized); + WriteTarget key = WriteTarget.deserializeFrom(view); + byte[] resultBuf = new byte[view.available()]; + view.read(resultBuf); + WriteResult writeResult = WRITE_RESULT_SERIALIZER.deserialize(version, resultBuf); + return new DynamicWriteResult(key, writeResult); + } + + throw new IOException("Unrecognized version or corrupt state: " + version); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java new file mode 100644 index 000000000000..ae24efafa6af --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.connector.sink2.CommittingSinkWriter; +import org.apache.flink.api.connector.sink2.SinkWriter; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Iceberg writer implementation for the {@link SinkWriter} interface. Used by the + * DynamicIcebergSink. Writes out the data to the final place, and emits {@link DynamicWriteResult} + * for every unique {@link WriteTarget} at checkpoint time. + */ +class DynamicWriter implements CommittingSinkWriter { + + private static final Logger LOG = LoggerFactory.getLogger(DynamicWriter.class); + + private final Map taskWriterFactories; + private final Map> writers; + private final DynamicWriterMetrics metrics; + private final int subTaskId; + private final int attemptId; + private final Catalog catalog; + private final FileFormat dataFileFormat; + private final long targetDataFileSize; + private final Map commonWriteProperties; + + DynamicWriter( + Catalog catalog, + FileFormat dataFileFormat, + long targetDataFileSize, + Map commonWriteProperties, + int cacheMaximumSize, + DynamicWriterMetrics metrics, + int subTaskId, + int attemptId) { + this.catalog = catalog; + this.dataFileFormat = dataFileFormat; + this.targetDataFileSize = targetDataFileSize; + this.commonWriteProperties = commonWriteProperties; + this.metrics = metrics; + this.subTaskId = subTaskId; + this.attemptId = attemptId; + this.taskWriterFactories = new LRUCache<>(cacheMaximumSize); + this.writers = Maps.newHashMap(); + + LOG.debug("DynamicIcebergSinkWriter created for subtask {} attemptId {}", subTaskId, attemptId); + } + + @Override + public void write(DynamicRecordInternal element, Context context) + throws IOException, InterruptedException { + writers + .computeIfAbsent( + new WriteTarget( + element.tableName(), + element.branch(), + element.schema().schemaId(), + element.spec().specId(), + element.upsertMode(), + element.equalityFields()), + writerKey -> { + RowDataTaskWriterFactory taskWriterFactory = + taskWriterFactories.computeIfAbsent( + writerKey, + factoryKey -> { + Table table = + catalog.loadTable(TableIdentifier.parse(factoryKey.tableName())); + + Map tableWriteProperties = + Maps.newHashMap(table.properties()); + tableWriteProperties.putAll(commonWriteProperties); + + Set equalityFieldIds = + getEqualityFields(table, element.equalityFields()); + if (element.upsertMode()) { + Preconditions.checkState( + !equalityFieldIds.isEmpty(), + "Equality field columns shouldn't be empty when configuring to 
use UPSERT data."); + if (!table.spec().isUnpartitioned()) { + for (PartitionField partitionField : table.spec().fields()) { + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), + "In UPSERT mode, partition field '%s' should be included in equality fields: '%s'", + partitionField, + equalityFieldIds); + } + } + } + + LOG.debug("Creating new writer factory for table '{}'", table.name()); + return new RowDataTaskWriterFactory( + () -> table, + FlinkSchemaUtil.convert(element.schema()), + targetDataFileSize, + dataFileFormat, + tableWriteProperties, + Lists.newArrayList(equalityFieldIds), + element.upsertMode(), + element.schema(), + element.spec()); + }); + + taskWriterFactory.initialize(subTaskId, attemptId); + return taskWriterFactory.create(); + }) + .write(element.rowData()); + } + + @Override + public void flush(boolean endOfInput) { + // flush is used to handle flush/endOfInput, so no action is taken here. + } + + @Override + public void close() throws Exception { + for (TaskWriter writer : writers.values()) { + writer.close(); + } + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("subtaskId", subTaskId) + .add("attemptId", attemptId) + .add("dataFileFormat", dataFileFormat) + .add("targetDataFileSize", targetDataFileSize) + .add("writeProperties", commonWriteProperties) + .toString(); + } + + @Override + public Collection prepareCommit() throws IOException { + List result = Lists.newArrayList(); + for (Map.Entry> entry : writers.entrySet()) { + long startNano = System.nanoTime(); + WriteResult writeResult = entry.getValue().complete(); + WriteTarget writeTarget = entry.getKey(); + metrics.updateFlushResult(writeTarget.tableName(), writeResult); + metrics.flushDuration( + writeTarget.tableName(), TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); + LOG.debug( + "Iceberg writer for table {} subtask {} attempt {} flushed {} data files and {} delete files", + writeTarget.tableName(), + subTaskId, + attemptId, + writeResult.dataFiles().length, + writeResult.deleteFiles().length); + + result.add(new DynamicWriteResult(writeTarget, writeResult)); + } + + writers.clear(); + + return result; + } + + private static Set getEqualityFields(Table table, Set equalityFieldIds) { + if (equalityFieldIds != null && !equalityFieldIds.isEmpty()) { + return equalityFieldIds; + } + Set identifierFieldIds = table.schema().identifierFieldIds(); + if (identifierFieldIds != null && !identifierFieldIds.isEmpty()) { + return identifierFieldIds; + } + return Collections.emptySet(); + } + + @VisibleForTesting + DynamicWriterMetrics getMetrics() { + return metrics; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java new file mode 100644 index 000000000000..2e1f82df9d2d --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Map; +import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.flink.sink.IcebergStreamWriterMetrics; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +class DynamicWriterMetrics { + + private final Map metrics; + private final MetricGroup mainMetricsGroup; + + DynamicWriterMetrics(MetricGroup mainMetricsGroup) { + this.mainMetricsGroup = mainMetricsGroup; + this.metrics = Maps.newHashMap(); + } + + public void updateFlushResult(String fullTableName, WriteResult result) { + writerMetrics(fullTableName).updateFlushResult(result); + } + + public void flushDuration(String fullTableName, long flushDurationMs) { + writerMetrics(fullTableName).flushDuration(flushDurationMs); + } + + IcebergStreamWriterMetrics writerMetrics(String fullTableName) { + return metrics.computeIfAbsent( + fullTableName, tableName -> new IcebergStreamWriterMetrics(mainMetricsGroup, tableName)); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java new file mode 100644 index 000000000000..ee0549997178 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.List; +import org.apache.iceberg.Schema; +import org.apache.iceberg.UpdateSchema; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.schema.SchemaWithPartnerVisitor; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +/** + * Visitor class that accumulates the set of changes needed to evolve an existing schema into the + * target schema. Changes are applied to an {@link UpdateSchema} operation. + * + *

<p>We support:
+ *
+ * <ul>
+ *   <li>Adding new columns
+ *   <li>Widening the type of existing columns
+ *   <li>Reordering columns
+ * </ul>
+ *
+ * <p>We don't support:
+ *
+ * <ul>
+ *   <li>Dropping columns
+ *   <li>Renaming columns
+ * </ul>
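+ *
+ * <p>A minimal usage sketch (not part of this class; an existing {@code table} and a
+ * {@code targetSchema} are assumed), showing how the accumulated changes are applied through the
+ * {@link UpdateSchema} API:
+ *
+ * <pre>{@code
+ * UpdateSchema update = table.updateSchema();
+ * EvolveSchemaVisitor.visit(update, table.schema(), targetSchema);
+ * update.commit(); // applies the accumulated changes
+ * }</pre>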
+ * + * The reason is that dropping columns would create issues with late / out of order data. Once we + * drop fields, we wouldn't be able to easily add them back later without losing the associated + * data. Renaming columns is not supported because we compare schemas by name, which doesn't allow + * for renaming without additional hints. + */ +public class EvolveSchemaVisitor extends SchemaWithPartnerVisitor { + + private final UpdateSchema api; + private final Schema existingSchema; + private final Schema targetSchema; + + private EvolveSchemaVisitor(UpdateSchema api, Schema existingSchema, Schema targetSchema) { + this.api = api; + this.existingSchema = existingSchema; + this.targetSchema = targetSchema; + } + + /** + * Adds changes needed to produce the target schema to an {@link UpdateSchema} operation. + * + *

Changes are accumulated to evolve the existingSchema into a targetSchema. + * + * @param api an UpdateSchema for adding changes + * @param existingSchema an existing schema + * @param targetSchema a new schema to compare with the existing + */ + public static void visit(UpdateSchema api, Schema existingSchema, Schema targetSchema) { + visit( + targetSchema, + -1, + new EvolveSchemaVisitor(api, existingSchema, targetSchema), + new CompareSchemasVisitor.PartnerIdByNameAccessors(existingSchema)); + } + + @Override + public Boolean struct(Types.StructType struct, Integer partnerId, List existingFields) { + if (partnerId == null) { + return true; + } + + // Add, update and order fields in the struct + Types.StructType partnerStruct = findFieldType(partnerId).asStructType(); + String after = null; + for (Types.NestedField targetField : struct.fields()) { + Types.NestedField nestedField = partnerStruct.field(targetField.name()); + final String columnName; + if (nestedField != null) { + updateColumn(nestedField, targetField); + columnName = this.existingSchema.findColumnName(nestedField.fieldId()); + } else { + addColumn(partnerId, targetField); + columnName = this.targetSchema.findColumnName(targetField.fieldId()); + } + + setPosition(columnName, after); + after = columnName; + } + + // Ensure that unused fields are made optional + for (Types.NestedField existingField : partnerStruct.fields()) { + if (struct.field(existingField.name()) == null) { + if (existingField.isRequired()) { + this.api.makeColumnOptional(this.existingSchema.findColumnName(existingField.fieldId())); + } + } + } + + return false; + } + + @Override + public Boolean field(Types.NestedField field, Integer partnerId, Boolean isFieldMissing) { + return partnerId == null; + } + + @Override + public Boolean list(Types.ListType list, Integer partnerId, Boolean isElementMissing) { + if (partnerId == null) { + return true; + } + + Preconditions.checkState( + !isElementMissing, "Error traversing schemas: element is missing, but list is present"); + + Types.ListType partnerList = findFieldType(partnerId).asListType(); + updateColumn(partnerList.fields().get(0), list.fields().get(0)); + + return false; + } + + @Override + public Boolean map( + Types.MapType map, Integer partnerId, Boolean isKeyMissing, Boolean isValueMissing) { + if (partnerId == null) { + return true; + } + + Preconditions.checkState( + !isKeyMissing, "Error traversing schemas: key is missing, but map is present"); + Preconditions.checkState( + !isValueMissing, "Error traversing schemas: value is missing, but map is present"); + + Types.MapType partnerMap = findFieldType(partnerId).asMapType(); + updateColumn(partnerMap.fields().get(0), map.fields().get(0)); + updateColumn(partnerMap.fields().get(1), map.fields().get(1)); + + return false; + } + + @Override + public Boolean primitive(Type.PrimitiveType primitive, Integer partnerId) { + return partnerId == null; + } + + private Type findFieldType(int fieldId) { + if (fieldId == -1) { + return existingSchema.asStruct(); + } else { + return existingSchema.findField(fieldId).type(); + } + } + + private void addColumn(int parentId, Types.NestedField field) { + String parentName = existingSchema.findColumnName(parentId); + api.addColumn(parentName, field.name(), field.type(), field.doc()); + } + + private void updateColumn(Types.NestedField existingField, Types.NestedField targetField) { + String existingColumnName = this.existingSchema.findColumnName(existingField.fieldId()); + + boolean needsOptionalUpdate = 
targetField.isOptional() && existingField.isRequired(); + boolean needsTypeUpdate = + targetField.type().isPrimitiveType() && !targetField.type().equals(existingField.type()); + boolean needsDocUpdate = + targetField.doc() != null && !targetField.doc().equals(existingField.doc()); + + if (needsOptionalUpdate) { + api.makeColumnOptional(existingColumnName); + } + + if (needsTypeUpdate) { + api.updateColumn(existingColumnName, targetField.type().asPrimitiveType()); + } + + if (needsDocUpdate) { + api.updateColumnDoc(existingColumnName, targetField.doc()); + } + } + + private void setPosition(String columnName, String after) { + if (after == null) { + this.api.moveFirst(columnName); + } else { + this.api.moveAfter(columnName, after); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java new file mode 100644 index 000000000000..91aa4a91710c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java @@ -0,0 +1,382 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; + +import java.util.Collections; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.runtime.state.KeyGroupRangeAssignment; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.sink.EqualityFieldKeySelector; +import org.apache.iceberg.flink.sink.PartitionKeySelector; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * The HashKeyGenerator is responsible for creating the appropriate hash key for Flink's keyBy + * operation. The hash key is generated depending on the user-provided DynamicRecord and the table + * metadata. Under the hood, we maintain a set of Flink {@link KeySelector}s which implement the + * appropriate Iceberg {@link DistributionMode}. 
For every table, we randomly select a consistent + * subset of writer subtasks which receive data via their associated keys, depending on the chosen + * DistributionMode. + * + *

Caching ensures that a new key selector is also created when the table metadata (e.g. schema, + * spec) or the user-provided metadata changes (e.g. distribution mode, write parallelism). + * + *
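+ * <p>A hypothetical sketch of obtaining a key ({@code record} and {@code maxParallelism} are
+ * assumptions made for illustration, not part of this class):
+ *
+ * <pre>{@code
+ * HashKeyGenerator generator = new HashKeyGenerator(100, maxParallelism);
+ * // No table metadata resolved yet, so the record's own schema and spec are used as the fallback
+ * int key = generator.generateKey(record, null, null, null);
+ * }</pre>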

Note: The hashing must be deterministic given the same parameters of the KeySelector and the + * same provided values. + */ +class HashKeyGenerator { + private static final Logger LOG = LoggerFactory.getLogger(HashKeyGenerator.class); + + private final int maxWriteParallelism; + private final Map> keySelectorCache; + + HashKeyGenerator(int maxCacheSize, int maxWriteParallelism) { + this.maxWriteParallelism = maxWriteParallelism; + this.keySelectorCache = new LRUCache<>(maxCacheSize); + } + + int generateKey(DynamicRecord dynamicRecord) throws Exception { + return generateKey(dynamicRecord, null, null, null); + } + + int generateKey( + DynamicRecord dynamicRecord, + @Nullable Schema tableSchema, + @Nullable PartitionSpec tableSpec, + @Nullable RowData overrideRowData) { + String tableIdent = dynamicRecord.tableIdentifier().toString(); + SelectorKey cacheKey = + new SelectorKey( + tableIdent, + dynamicRecord.branch(), + tableSchema != null ? tableSchema.schemaId() : null, + tableSpec != null ? tableSpec.specId() : null, + dynamicRecord.schema(), + dynamicRecord.spec(), + dynamicRecord.equalityFields()); + KeySelector keySelector = + keySelectorCache.computeIfAbsent( + cacheKey, + k -> + getKeySelector( + tableIdent, + MoreObjects.firstNonNull(tableSchema, dynamicRecord.schema()), + MoreObjects.firstNonNull(tableSpec, dynamicRecord.spec()), + MoreObjects.firstNonNull( + dynamicRecord.distributionMode(), DistributionMode.NONE), + MoreObjects.firstNonNull( + dynamicRecord.equalityFields(), Collections.emptySet()), + dynamicRecord.writeParallelism())); + try { + return keySelector.getKey( + overrideRowData != null ? overrideRowData : dynamicRecord.rowData()); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private KeySelector getKeySelector( + String tableName, + Schema schema, + PartitionSpec spec, + DistributionMode mode, + Set equalityFields, + int writeParallelism) { + LOG.debug( + "Creating new KeySelector for table '{}' with distribution mode '{}'", tableName, mode); + switch (mode) { + case NONE: + if (equalityFields.isEmpty()) { + return tableKeySelector(tableName, writeParallelism, maxWriteParallelism); + } else { + LOG.info( + "{}: Distribute rows by equality fields, because there are equality fields set", + tableName); + return equalityFieldKeySelector( + tableName, schema, equalityFields, writeParallelism, maxWriteParallelism); + } + + case HASH: + if (equalityFields.isEmpty()) { + if (spec.isUnpartitioned()) { + LOG.warn( + "{}: Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and table is unpartitioned", + tableName); + return tableKeySelector(tableName, writeParallelism, maxWriteParallelism); + } else { + return partitionKeySelector( + tableName, schema, spec, writeParallelism, maxWriteParallelism); + } + } else { + if (spec.isUnpartitioned()) { + LOG.info( + "{}: Distribute rows by equality fields, because there are equality fields set " + + "and table is unpartitioned", + tableName); + return equalityFieldKeySelector( + tableName, schema, equalityFields, writeParallelism, maxWriteParallelism); + } else { + for (PartitionField partitionField : spec.fields()) { + Preconditions.checkState( + equalityFields.contains(partitionField.name()), + "%s: In 'hash' distribution mode with equality fields set, partition field '%s' " + + "should be included in equality fields: '%s'", + tableName, + partitionField, + schema.columns().stream() + .filter(c -> equalityFields.contains(c.name())) + .collect(Collectors.toList())); + } + 
return partitionKeySelector( + tableName, schema, spec, writeParallelism, maxWriteParallelism); + } + } + + case RANGE: + if (schema.identifierFieldIds().isEmpty()) { + LOG.warn( + "{}: Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and {}='range' is not supported yet in flink", + tableName, + WRITE_DISTRIBUTION_MODE); + return tableKeySelector(tableName, writeParallelism, maxWriteParallelism); + } else { + LOG.info( + "{}: Distribute rows by equality fields, because there are equality fields set " + + "and {}='range' is not supported yet in flink", + tableName, + WRITE_DISTRIBUTION_MODE); + return equalityFieldKeySelector( + tableName, schema, equalityFields, writeParallelism, maxWriteParallelism); + } + + default: + throw new IllegalArgumentException( + tableName + ": Unrecognized " + WRITE_DISTRIBUTION_MODE + ": " + mode); + } + } + + private static KeySelector equalityFieldKeySelector( + String tableName, + Schema schema, + Set equalityFields, + int writeParallelism, + int maxWriteParallelism) { + return new TargetLimitedKeySelector( + new EqualityFieldKeySelector( + schema, + FlinkSchemaUtil.convert(schema), + DynamicSinkUtil.getEqualityFieldIds(equalityFields, schema)), + tableName, + writeParallelism, + maxWriteParallelism); + } + + private static KeySelector partitionKeySelector( + String tableName, + Schema schema, + PartitionSpec spec, + int writeParallelism, + int maxWriteParallelism) { + KeySelector inner = + new PartitionKeySelector(spec, schema, FlinkSchemaUtil.convert(schema)); + return new TargetLimitedKeySelector( + in -> inner.getKey(in).hashCode(), tableName, writeParallelism, maxWriteParallelism); + } + + private static KeySelector tableKeySelector( + String tableName, int writeParallelism, int maxWriteParallelism) { + return new TargetLimitedKeySelector( + new RoundRobinKeySelector<>(writeParallelism), + tableName, + writeParallelism, + maxWriteParallelism); + } + + /** + * Generates a new key using the salt as a base, and reduces the target key range of the {@link + * #wrapped} {@link KeySelector} to {@link #writeParallelism}. + */ + private static class TargetLimitedKeySelector implements KeySelector { + private final KeySelector wrapped; + private final int writeParallelism; + private final int[] distinctKeys; + + @SuppressWarnings("checkstyle:ParameterAssignment") + TargetLimitedKeySelector( + KeySelector wrapped, + String tableName, + int writeParallelism, + int maxWriteParallelism) { + if (writeParallelism > maxWriteParallelism) { + LOG.warn( + "{}: writeParallelism {} is greater than maxWriteParallelism {}. 
Capping writeParallelism at {}", + tableName, + writeParallelism, + maxWriteParallelism, + maxWriteParallelism); + writeParallelism = maxWriteParallelism; + } + this.wrapped = wrapped; + this.writeParallelism = writeParallelism; + this.distinctKeys = new int[writeParallelism]; + + // Ensures that the generated keys are always result in unique slotId + Set targetSlots = Sets.newHashSetWithExpectedSize(writeParallelism); + int nextKey = tableName.hashCode(); + for (int i = 0; i < writeParallelism; ++i) { + int subtaskId = subtaskId(nextKey, writeParallelism, maxWriteParallelism); + while (targetSlots.contains(subtaskId)) { + ++nextKey; + subtaskId = subtaskId(nextKey, writeParallelism, maxWriteParallelism); + } + + targetSlots.add(subtaskId); + distinctKeys[i] = nextKey; + ++nextKey; + } + } + + @Override + public Integer getKey(RowData value) throws Exception { + return distinctKeys[ + DynamicSinkUtil.safeAbs(wrapped.getKey(value).hashCode()) % writeParallelism]; + } + + private static int subtaskId(int key, int writeParallelism, int maxWriteParallelism) { + return KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup( + maxWriteParallelism, + writeParallelism, + KeyGroupRangeAssignment.computeKeyGroupForKeyHash(key, maxWriteParallelism)); + } + } + + /** + * Generates evenly distributed keys between [0..{@link #maxTarget}) range using round-robin + * algorithm. + * + * @param unused input for key generation + */ + private static class RoundRobinKeySelector implements KeySelector { + private final int maxTarget; + private int lastTarget = 0; + + RoundRobinKeySelector(int maxTarget) { + this.maxTarget = maxTarget; + } + + @Override + public Integer getKey(T value) { + lastTarget = (lastTarget + 1) % maxTarget; + return lastTarget; + } + } + + /** + * Cache key for the {@link KeySelector}. Only contains the {@link Schema} and the {@link + * PartitionSpec} if their ids are not provided. + */ + static class SelectorKey { + private final String tableName; + private final String branch; + private final Integer schemaId; + private final Integer specId; + private final Schema schema; + private final PartitionSpec spec; + private final Set equalityFields; + + SelectorKey( + String tableName, + String branch, + @Nullable Integer tableSchemaId, + @Nullable Integer tableSpecId, + Schema schema, + PartitionSpec spec, + Set equalityFields) { + this.tableName = tableName; + this.branch = branch; + this.schemaId = tableSchemaId; + this.specId = tableSpecId; + this.schema = tableSchemaId == null ? schema : null; + this.spec = tableSpecId == null ? 
spec : null; + this.equalityFields = equalityFields; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + + if (other == null || getClass() != other.getClass()) { + return false; + } + + SelectorKey that = (SelectorKey) other; + return Objects.equals(tableName, that.tableName) + && Objects.equals(branch, that.branch) + && Objects.equals(schemaId, that.schemaId) + && Objects.equals(specId, that.specId) + && Objects.equals(schema, that.schema) + && Objects.equals(spec, that.spec) + && Objects.equals(equalityFields, that.equalityFields); + } + + @Override + public int hashCode() { + return Objects.hash(tableName, branch, schemaId, specId, schema, spec, equalityFields); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("tableName", tableName) + .add("branch", branch) + .add("schemaId", schemaId) + .add("specId", specId) + .add("schema", schema) + .add("spec", spec) + .add("equalityFields", equalityFields) + .toString(); + } + } + + @VisibleForTesting + Map> getKeySelectorCache() { + return keySelectorCache; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java new file mode 100644 index 000000000000..be2866dc4e19 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.function.Consumer; + +/** + * A performant, fixed size least recently used (LRU) cache implementation. + * + *
<p>
This cache has O(1) time complexity for get/put operations and provides eviction notifications + * when entries are removed due to size constraints. It offers better performance than similarly + * configured Caffeine caches, making it ideal for hot path operations. + * + *
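A minimal, self-contained sketch of the access-ordered LinkedHashMap behavior this relies on (class and key names are invented for illustration):

    import java.util.LinkedHashMap;
    import java.util.Map;

    class LruSketch {
      public static void main(String[] args) {
        int maxSize = 2;
        // accessOrder = true moves an entry to the tail on every get/put
        Map<String, Integer> cache =
            new LinkedHashMap<String, Integer>(16, 0.75f, true) {
              @Override
              protected boolean removeEldestEntry(Map.Entry<String, Integer> eldest) {
                // evict the least recently used entry once the bound is exceeded
                return size() > maxSize;
              }
            };
        cache.put("a", 1);
        cache.put("b", 2);
        cache.get("a");    // touching "a" makes "b" the eldest entry
        cache.put("c", 3); // exceeds maxSize, so "b" is evicted
        System.out.println(cache.keySet()); // prints [a, c]
      }
    }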
<p>
This implementation extends {@link LinkedHashMap} with access-order traversal and automated + * removal of least recently used entries when the maximum size is reached. + */ +@SuppressWarnings("checkstyle:IllegalType") +class LRUCache extends LinkedHashMap { + /** Defaults from {@link java.util.HashMap} */ + private static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; + + private static final float DEFAULT_LOAD_FACTOR = 0.75f; + + private final int maximumSize; + private final Consumer> evictionCallback; + + LRUCache(int maximumSize) { + this(maximumSize, ignored -> {}); + } + + LRUCache(int maximumSize, Consumer> evictionCallback) { + super(Math.min(maximumSize, DEFAULT_INITIAL_CAPACITY), DEFAULT_LOAD_FACTOR, true); + this.maximumSize = maximumSize; + this.evictionCallback = evictionCallback; + } + + @Override + protected boolean removeEldestEntry(Map.Entry eldest) { + boolean remove = size() > maximumSize; + if (remove) { + evictionCallback.accept(eldest); + } + + return remove; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java new file mode 100644 index 000000000000..90b6c7295cb7 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.List; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.Term; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** Checks compatibility of PartitionSpecs and evolves one into the other. */ +public class PartitionSpecEvolution { + + private PartitionSpecEvolution() {} + + /** + * Checks whether two PartitionSpecs are compatible with each other. Less strict than {@code + * PartitionSpec#compatible} in the sense that it tolerates differently named partition fields, as + * long as their transforms and field names corresponding to their source ids match. 
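For example, the following self-contained sketch builds two specs over a one-column schema (field and class names invented) whose partition fields differ only in name, which the stricter built-in check rejects while the check described here is expected to tolerate:

    import org.apache.iceberg.PartitionSpec;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.types.Types;

    class SpecCompatibilitySketch {
      public static void main(String[] args) {
        Schema schema = new Schema(Types.NestedField.required(1, "id", Types.LongType.get()));

        // Same source column and transform, different partition field names.
        PartitionSpec byDefaultName = PartitionSpec.builderFor(schema).bucket("id", 16).build();
        PartitionSpec byCustomName =
            PartitionSpec.builderFor(schema).bucket("id", 16, "id_bucket_custom").build();

        System.out.println(byDefaultName.compatibleWith(byCustomName)); // false: names differ
        // checkCompatibility(byDefaultName, byCustomName) is expected to return true,
        // since the source column names and the transforms match.
      }
    }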
+ */ + public static boolean checkCompatibility(PartitionSpec spec1, PartitionSpec spec2) { + if (spec1.equals(spec2)) { + return true; + } + + if (spec1.fields().size() != spec2.fields().size()) { + return false; + } + + for (int i = 0; i < spec1.fields().size(); i++) { + PartitionField field1 = spec1.fields().get(i); + PartitionField field2 = spec2.fields().get(i); + if (!specFieldsAreCompatible(field1, spec1.schema(), field2, spec2.schema())) { + return false; + } + } + + return true; + } + + static PartitionSpecChanges evolve(PartitionSpec currentSpec, PartitionSpec targetSpec) { + if (currentSpec.compatibleWith(targetSpec)) { + return new PartitionSpecChanges(); + } + + PartitionSpecChanges result = new PartitionSpecChanges(); + + int maxNumFields = Math.max(currentSpec.fields().size(), targetSpec.fields().size()); + for (int i = 0; i < maxNumFields; i++) { + PartitionField currentField = Iterables.get(currentSpec.fields(), i, null); + PartitionField targetField = Iterables.get(targetSpec.fields(), i, null); + + if (!specFieldsAreCompatible( + currentField, currentSpec.schema(), targetField, targetSpec.schema())) { + + if (currentField != null) { + result.remove(toTerm(currentField, currentSpec.schema())); + } + + if (targetField != null) { + result.add(toTerm(targetField, targetSpec.schema())); + } + } + } + + return result; + } + + static class PartitionSpecChanges { + private final List termsToAdd = Lists.newArrayList(); + private final List termsToRemove = Lists.newArrayList(); + + public void add(Term term) { + termsToAdd.add(term); + } + + public void remove(Term term) { + termsToRemove.add(term); + } + + public List termsToAdd() { + return termsToAdd; + } + + public List termsToRemove() { + return termsToRemove; + } + + public boolean isEmpty() { + return termsToAdd.isEmpty() && termsToRemove.isEmpty(); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(PartitionSpecEvolution.class) + .add("termsToAdd", termsToAdd) + .add("termsToRemove", termsToRemove) + .toString(); + } + } + + private static Term toTerm(PartitionField field, Schema schema) { + String sourceName = schema.idToName().get(field.sourceId()); + return Expressions.transform(sourceName, field.transform()); + } + + private static boolean specFieldsAreCompatible( + PartitionField field1, Schema schemaField1, PartitionField field2, Schema schemaField2) { + if (field1 == null || field2 == null) { + return false; + } + String firstFieldSourceName = schemaField1.idToName().get(field1.sourceId()); + String secondFieldSourceName = schemaField2.idToName().get(field2.sourceId()); + return firstFieldSourceName.equals(secondFieldSourceName) + && field1.transform().toString().equals(field2.transform().toString()); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java new file mode 100644 index 000000000000..85a5a4abf29c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Map; +import java.util.Set; +import org.apache.flink.annotation.Internal; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.NoSuchTableException; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * TableMetadataCache is responsible for caching table metadata to avoid hitting the catalog too + * frequently. We store table identifier, schema, partition spec, and a set of past schema + * comparison results of the active table schema against the last input schemas. + */ +@Internal +class TableMetadataCache { + + private static final Logger LOG = LoggerFactory.getLogger(TableMetadataCache.class); + private static final Tuple2 EXISTS = Tuple2.of(true, null); + private static final Tuple2 NOT_EXISTS = Tuple2.of(false, null); + static final ResolvedSchemaInfo NOT_FOUND = + new ResolvedSchemaInfo( + null, CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED, DataConverter.identity()); + + private final Catalog catalog; + private final long refreshMs; + private final int inputSchemasPerTableCacheMaximumSize; + private final Map tableCache; + + TableMetadataCache( + Catalog catalog, int maximumSize, long refreshMs, int inputSchemasPerTableCacheMaximumSize) { + this.catalog = catalog; + this.refreshMs = refreshMs; + this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; + this.tableCache = new LRUCache<>(maximumSize); + } + + Tuple2 exists(TableIdentifier identifier) { + CacheItem cached = tableCache.get(identifier); + if (cached != null && Boolean.TRUE.equals(cached.tableExists)) { + return EXISTS; + } else if (needsRefresh(cached, true)) { + return refreshTable(identifier); + } else { + return NOT_EXISTS; + } + } + + String branch(TableIdentifier identifier, String branch) { + return branch(identifier, branch, true); + } + + ResolvedSchemaInfo schema(TableIdentifier identifier, Schema input) { + return schema(identifier, input, true); + } + + PartitionSpec spec(TableIdentifier identifier, PartitionSpec spec) { + return spec(identifier, spec, true); + } + + void update(TableIdentifier identifier, Table table) { + tableCache.put( + identifier, + new CacheItem( + true, + table.refs().keySet(), + table.schemas(), + table.specs(), + inputSchemasPerTableCacheMaximumSize)); + } + + private String branch(TableIdentifier identifier, String branch, boolean allowRefresh) { + CacheItem cached = tableCache.get(identifier); + if (cached != null && cached.tableExists && cached.branches.contains(branch)) { + return branch; + } + + if (needsRefresh(cached, allowRefresh)) { + refreshTable(identifier); + return branch(identifier, branch, false); + } else { + return null; + } + } + + private ResolvedSchemaInfo schema( + TableIdentifier identifier, 
Schema input, boolean allowRefresh) { + CacheItem cached = tableCache.get(identifier); + Schema compatible = null; + if (cached != null && cached.tableExists) { + // This only works if the {@link Schema#equals(Object)} returns true for the old schema + // and a new schema. Performance is paramount as this code is on the hot path. Every other + // way for comparing 2 schemas were performing worse than the + // {@link CompareByNameVisitor#visit(Schema, Schema, boolean)}, so caching was useless. + ResolvedSchemaInfo lastResult = cached.inputSchemas.get(input); + if (lastResult != null) { + return lastResult; + } + + for (Map.Entry tableSchema : cached.tableSchemas.entrySet()) { + CompareSchemasVisitor.Result result = + CompareSchemasVisitor.visit(input, tableSchema.getValue(), true); + if (result == CompareSchemasVisitor.Result.SAME) { + ResolvedSchemaInfo newResult = + new ResolvedSchemaInfo( + tableSchema.getValue(), + CompareSchemasVisitor.Result.SAME, + DataConverter.identity()); + cached.inputSchemas.put(input, newResult); + return newResult; + } else if (compatible == null + && result == CompareSchemasVisitor.Result.DATA_CONVERSION_NEEDED) { + compatible = tableSchema.getValue(); + } + } + } + + if (needsRefresh(cached, allowRefresh)) { + refreshTable(identifier); + return schema(identifier, input, false); + } else if (compatible != null) { + ResolvedSchemaInfo newResult = + new ResolvedSchemaInfo( + compatible, + CompareSchemasVisitor.Result.DATA_CONVERSION_NEEDED, + DataConverter.get( + FlinkSchemaUtil.convert(input), FlinkSchemaUtil.convert(compatible))); + cached.inputSchemas.put(input, newResult); + return newResult; + } else if (cached != null && cached.tableExists) { + cached.inputSchemas.put(input, NOT_FOUND); + return NOT_FOUND; + } else { + return NOT_FOUND; + } + } + + private PartitionSpec spec(TableIdentifier identifier, PartitionSpec spec, boolean allowRefresh) { + CacheItem cached = tableCache.get(identifier); + if (cached != null && cached.tableExists) { + for (PartitionSpec tableSpec : cached.specs.values()) { + if (PartitionSpecEvolution.checkCompatibility(tableSpec, spec)) { + return tableSpec; + } + } + } + + if (needsRefresh(cached, allowRefresh)) { + refreshTable(identifier); + return spec(identifier, spec, false); + } else { + return null; + } + } + + private Tuple2 refreshTable(TableIdentifier identifier) { + try { + Table table = catalog.loadTable(identifier); + update(identifier, table); + return EXISTS; + } catch (NoSuchTableException e) { + LOG.debug("Table doesn't exist {}", identifier, e); + tableCache.put(identifier, new CacheItem(false, null, null, null, 1)); + return Tuple2.of(false, e); + } + } + + private boolean needsRefresh(CacheItem cacheItem, boolean allowRefresh) { + return allowRefresh + && (cacheItem == null || cacheItem.created + refreshMs > System.currentTimeMillis()); + } + + public void invalidate(TableIdentifier identifier) { + tableCache.remove(identifier); + } + + /** Handles timeout for missing items only. Caffeine performance causes noticeable delays. 
*/ + static class CacheItem { + private final long created = System.currentTimeMillis(); + + private final boolean tableExists; + private final Set branches; + private final Map tableSchemas; + private final Map specs; + private final Map inputSchemas; + + private CacheItem( + boolean tableExists, + Set branches, + Map tableSchemas, + Map specs, + int inputSchemaCacheMaximumSize) { + this.tableExists = tableExists; + this.branches = branches; + this.tableSchemas = tableSchemas; + this.specs = specs; + this.inputSchemas = + new LRUCache<>(inputSchemaCacheMaximumSize, CacheItem::inputSchemaEvictionListener); + } + + private static void inputSchemaEvictionListener( + Map.Entry evictedEntry) { + LOG.warn( + "Performance degraded as records with different schema is generated for the same table. " + + "Likely the DynamicRecord.schema is not reused. " + + "Reuse the same instance if the record schema is the same to improve performance"); + } + + @VisibleForTesting + Map inputSchemas() { + return inputSchemas; + } + } + + static class ResolvedSchemaInfo { + private final Schema resolvedTableSchema; + private final CompareSchemasVisitor.Result compareResult; + private final DataConverter recordConverter; + + ResolvedSchemaInfo( + Schema tableSchema, + CompareSchemasVisitor.Result compareResult, + DataConverter recordConverter) { + this.resolvedTableSchema = tableSchema; + this.compareResult = compareResult; + this.recordConverter = recordConverter; + } + + Schema resolvedTableSchema() { + return resolvedTableSchema; + } + + CompareSchemasVisitor.Result compareResult() { + return compareResult; + } + + DataConverter recordConverter() { + return recordConverter; + } + } + + @VisibleForTesting + Map getInternalCache() { + return tableCache; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java new file mode 100644 index 000000000000..84d0ed9be5d0 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.io.Serializable; +import java.util.Map; +import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.java.tuple.Tuple3; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** + * A Cache which holds Flink's {@link RowDataSerializer} for a given table name and schema. This + * avoids re-creating the serializer for a given table schema for every incoming record. + * + *
<p>
There is an additional optimization built into this class: users do not have to supply the + * full schema / spec, but can also provide just their ids. This avoids transferring the schema / spec for + * every record. If the id is unknown, the schema / spec will be retrieved from the catalog. + * + *
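A small, self-contained sketch of the per-schema serializer reuse this cache provides, using the same FlinkSchemaUtil and RowDataSerializer calls that appear later in this class (the one-column schema and the names are invented):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.flink.table.runtime.typeutils.RowDataSerializer;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.flink.FlinkSchemaUtil;
    import org.apache.iceberg.types.Types;

    class SerializerReuseSketch {
      public static void main(String[] args) {
        Schema schema = new Schema(Types.NestedField.required(1, "id", Types.LongType.get()));
        Map<Schema, RowDataSerializer> bySchema = new HashMap<>();

        // Built once for the schema instance, then served from the map on later lookups.
        RowDataSerializer first =
            bySchema.computeIfAbsent(schema, s -> new RowDataSerializer(FlinkSchemaUtil.convert(s)));
        RowDataSerializer second =
            bySchema.computeIfAbsent(schema, s -> new RowDataSerializer(FlinkSchemaUtil.convert(s)));

        System.out.println(first == second); // true: the second lookup hits the cache
      }
    }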
<p>
Note that the caller must ensure that ids are only used for known schemas / specs. The id + * optimization must not be used in the update path. + */ +@Internal +class TableSerializerCache implements Serializable { + + private final CatalogLoader catalogLoader; + private final int maximumSize; + private transient Map serializers; + + TableSerializerCache(CatalogLoader catalogLoader, int maximumSize) { + this.catalogLoader = catalogLoader; + this.maximumSize = maximumSize; + } + + RowDataSerializer serializer(String tableName, Schema schema, PartitionSpec spec) { + return serializer(tableName, schema, spec, null, null).f0; + } + + Tuple3 serializerWithSchemaAndSpec( + String tableName, Integer schemaId, Integer specId) { + return serializer(tableName, null, null, schemaId, specId); + } + + private Tuple3 serializer( + String tableName, + @Nullable Schema unknownSchema, + @Nullable PartitionSpec unknownSpec, + @Nullable Integer schemaId, + @Nullable Integer specId) { + Preconditions.checkState( + (unknownSchema == null && unknownSpec == null) ^ (schemaId == null && specId == null), + "Either the full schema/spec or their ids must be provided."); + + if (serializers == null) { + // We need to initialize the cache at the first time + this.serializers = new LRUCache<>(maximumSize); + } + + SerializerInfo info = serializers.computeIfAbsent(tableName, SerializerInfo::new); + Schema schema = unknownSchema != null ? unknownSchema : info.schemas.get(schemaId); + PartitionSpec spec = unknownSpec != null ? unknownSpec : info.specs.get(specId); + + if (schema == null || spec == null) { + info.update(); + schema = info.schemas.get(schemaId); + spec = info.specs.get(specId); + } + + RowDataSerializer serializer = + info.serializers.computeIfAbsent( + schema, s -> new RowDataSerializer(FlinkSchemaUtil.convert(s))); + + return Tuple3.of(serializer, schema, spec); + } + + CatalogLoader catalogLoader() { + return catalogLoader; + } + + int maximumSize() { + return maximumSize; + } + + private class SerializerInfo { + private final String tableName; + private final Map serializers; + private Map schemas; + private Map specs; + + SerializerInfo(String tableName) { + this.tableName = tableName; + this.serializers = Maps.newHashMapWithExpectedSize(2); + this.schemas = Maps.newHashMapWithExpectedSize(1); + this.specs = Maps.newHashMapWithExpectedSize(0); + } + + private void update() { + Table table = catalogLoader.loadCatalog().loadTable(TableIdentifier.parse(tableName)); + schemas = table.schemas(); + specs = table.specs(); + } + } + + @VisibleForTesting + Map getCache() { + return serializers; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java new file mode 100644 index 000000000000..fdd182830b2c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.tuple.Tuple3; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.UpdatePartitionSpec; +import org.apache.iceberg.UpdateSchema; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.SupportsNamespaces; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.AlreadyExistsException; +import org.apache.iceberg.exceptions.CommitFailedException; +import org.apache.iceberg.exceptions.NoSuchNamespaceException; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Updates the Iceberg tables in case of schema, branch, or partition changes. */ +@Internal +class TableUpdater { + + private static final Logger LOG = LoggerFactory.getLogger(TableUpdater.class); + private final TableMetadataCache cache; + private final Catalog catalog; + + TableUpdater(TableMetadataCache cache, Catalog catalog) { + this.cache = cache; + this.catalog = catalog; + } + + /** + * Creates or updates a table to make sure that the given branch, schema, spec exists. + * + * @return a {@link Tuple3} of the new {@link Schema}, the status of the schema compared to the + * requested one, and the new {@link PartitionSpec#specId()}. + */ + Tuple2 update( + TableIdentifier tableIdentifier, String branch, Schema schema, PartitionSpec spec) { + findOrCreateTable(tableIdentifier, schema, spec); + findOrCreateBranch(tableIdentifier, branch); + TableMetadataCache.ResolvedSchemaInfo newSchemaInfo = + findOrCreateSchema(tableIdentifier, schema); + PartitionSpec newSpec = findOrCreateSpec(tableIdentifier, spec); + return Tuple2.of(newSchemaInfo, newSpec); + } + + private void findOrCreateTable(TableIdentifier identifier, Schema schema, PartitionSpec spec) { + Tuple2 exists = cache.exists(identifier); + if (Boolean.FALSE.equals(exists.f0)) { + if (exists.f1 instanceof NoSuchNamespaceException) { + SupportsNamespaces catalogWithNameSpace = (SupportsNamespaces) catalog; + LOG.info("Namespace {} not found during table search. Creating namespace", identifier); + try { + catalogWithNameSpace.createNamespace(identifier.namespace()); + } catch (AlreadyExistsException e) { + LOG.debug("Namespace {} created concurrently", identifier.namespace(), e); + } + } + + LOG.info("Table {} not found during table search. Creating table.", identifier); + try { + Table table = catalog.createTable(identifier, schema, spec); + cache.update(identifier, table); + } catch (AlreadyExistsException e) { + LOG.debug("Table {} created concurrently. 
Skipping creation.", identifier, e); + cache.invalidate(identifier); + findOrCreateTable(identifier, schema, spec); + } + } + } + + private void findOrCreateBranch(TableIdentifier identifier, String branch) { + String fromCache = cache.branch(identifier, branch); + if (fromCache == null) { + Table table = catalog.loadTable(identifier); + try { + table.manageSnapshots().createBranch(branch).commit(); + LOG.info("Branch {} for {} created", branch, identifier); + } catch (CommitFailedException e) { + table.refresh(); + if (table.refs().containsKey(branch)) { + LOG.debug("Branch {} concurrently created for {}.", branch, identifier); + } else { + LOG.error("Failed to create branch {} for {}.", branch, identifier, e); + throw e; + } + } + + cache.update(identifier, table); + } + } + + private TableMetadataCache.ResolvedSchemaInfo findOrCreateSchema( + TableIdentifier identifier, Schema schema) { + TableMetadataCache.ResolvedSchemaInfo fromCache = cache.schema(identifier, schema); + if (fromCache.compareResult() != CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED) { + return fromCache; + } else { + Table table = catalog.loadTable(identifier); + Schema tableSchema = table.schema(); + CompareSchemasVisitor.Result result = CompareSchemasVisitor.visit(schema, tableSchema, true); + switch (result) { + case SAME: + cache.update(identifier, table); + return new TableMetadataCache.ResolvedSchemaInfo( + tableSchema, result, DataConverter.identity()); + case DATA_CONVERSION_NEEDED: + cache.update(identifier, table); + return new TableMetadataCache.ResolvedSchemaInfo( + tableSchema, + result, + DataConverter.get( + FlinkSchemaUtil.convert(schema), FlinkSchemaUtil.convert(tableSchema))); + case SCHEMA_UPDATE_NEEDED: + LOG.info( + "Triggering schema update for table {} {} to {}", identifier, tableSchema, schema); + UpdateSchema updateApi = table.updateSchema(); + EvolveSchemaVisitor.visit(updateApi, tableSchema, schema); + + try { + updateApi.commit(); + cache.update(identifier, table); + TableMetadataCache.ResolvedSchemaInfo comparisonAfterMigration = + cache.schema(identifier, schema); + Schema newSchema = comparisonAfterMigration.resolvedTableSchema(); + LOG.info("Table {} schema updated from {} to {}", identifier, tableSchema, newSchema); + return comparisonAfterMigration; + } catch (CommitFailedException e) { + cache.invalidate(identifier); + TableMetadataCache.ResolvedSchemaInfo newSchema = cache.schema(identifier, schema); + if (newSchema.compareResult() != CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED) { + LOG.debug("Table {} schema updated concurrently to {}", identifier, schema); + return newSchema; + } else { + LOG.error( + "Schema update failed for {} from {} to {}", identifier, tableSchema, schema, e); + throw e; + } + } + default: + throw new IllegalArgumentException("Unknown comparison result"); + } + } + } + + private PartitionSpec findOrCreateSpec(TableIdentifier identifier, PartitionSpec targetSpec) { + PartitionSpec currentSpec = cache.spec(identifier, targetSpec); + if (currentSpec != null) { + return currentSpec; + } + + Table table = catalog.loadTable(identifier); + currentSpec = table.spec(); + + PartitionSpecEvolution.PartitionSpecChanges result = + PartitionSpecEvolution.evolve(currentSpec, targetSpec); + if (result.isEmpty()) { + LOG.info("Returning equivalent existing spec {} for {}", currentSpec, targetSpec); + return currentSpec; + } + + LOG.info( + "Spec for table {} has been altered. 
Updating from {} to {}", + identifier, + currentSpec, + targetSpec); + UpdatePartitionSpec updater = table.updateSpec(); + result.termsToRemove().forEach(updater::removeField); + result.termsToAdd().forEach(updater::addField); + + try { + updater.commit(); + cache.update(identifier, table); + } catch (CommitFailedException e) { + cache.invalidate(identifier); + PartitionSpec newSpec = cache.spec(identifier, targetSpec); + result = PartitionSpecEvolution.evolve(targetSpec, newSpec); + if (result.isEmpty()) { + LOG.debug("Table {} partition spec updated concurrently to {}", identifier, newSpec); + return newSpec; + } else { + LOG.error( + "Partition spec update failed for {} from {} to {}", + identifier, + currentSpec, + targetSpec, + e); + throw e; + } + } + return cache.spec(identifier, targetSpec); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java new file mode 100644 index 000000000000..afd5b637e933 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Objects; +import java.util.Set; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.hadoop.util.Sets; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +class WriteTarget implements Serializable { + + private final String tableName; + private final String branch; + private final Integer schemaId; + private final Integer specId; + private final boolean upsertMode; + private final Set equalityFields; + + WriteTarget( + String tableName, + String branch, + Integer schemaId, + Integer specId, + boolean upsertMode, + Set equalityFields) { + this.tableName = tableName; + this.branch = branch != null ? 
branch : "main"; + this.schemaId = schemaId; + this.specId = specId; + this.upsertMode = upsertMode; + this.equalityFields = equalityFields; + } + + String tableName() { + return tableName; + } + + String branch() { + return branch; + } + + Integer schemaId() { + return schemaId; + } + + Integer specId() { + return specId; + } + + boolean upsertMode() { + return upsertMode; + } + + Set equalityFields() { + return equalityFields; + } + + void serializeTo(DataOutputView view) throws IOException { + view.writeUTF(tableName); + view.writeUTF(branch); + view.writeInt(schemaId); + view.writeInt(specId); + view.writeBoolean(upsertMode); + view.writeInt(equalityFields.size()); + for (Integer equalityField : equalityFields) { + view.writeInt(equalityField); + } + } + + static WriteTarget deserializeFrom(DataInputView view) throws IOException { + return new WriteTarget( + view.readUTF(), + view.readUTF(), + view.readInt(), + view.readInt(), + view.readBoolean(), + readSet(view)); + } + + private static Set readSet(DataInputView view) throws IOException { + int numFields = view.readInt(); + Set equalityFields = Sets.newHashSetWithExpectedSize(numFields); + for (int i = 0; i < numFields; i++) { + equalityFields.add(view.readInt()); + } + + return equalityFields; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + + if (other == null || getClass() != other.getClass()) { + return false; + } + + WriteTarget that = (WriteTarget) other; + return Objects.equals(tableName, that.tableName) + && Objects.equals(branch, that.branch) + && Objects.equals(schemaId, that.schemaId) + && Objects.equals(specId, that.specId) + && upsertMode == that.upsertMode + && Objects.equals(equalityFields, that.equalityFields); + } + + @Override + public int hashCode() { + return Objects.hash(tableName, branch, schemaId, specId, upsertMode, equalityFields); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("tableName", tableName) + .add("branch", branch) + .add("schemaId", schemaId) + .add("specId", specId) + .add("upsertMode", upsertMode) + .toString(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java new file mode 100644 index 000000000000..95c2328f032a --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java @@ -0,0 +1,264 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Map; +import java.util.NavigableMap; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.datasketches.sampling.ReservoirItemsUnion; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * AggregatedStatisticsTracker tracks the statistics aggregation received from {@link + * DataStatisticsOperator} subtasks for every checkpoint. + */ +class AggregatedStatisticsTracker { + private static final Logger LOG = LoggerFactory.getLogger(AggregatedStatisticsTracker.class); + + private final String operatorName; + private final int parallelism; + private final TypeSerializer statisticsSerializer; + private final int downstreamParallelism; + private final StatisticsType statisticsType; + private final int switchToSketchThreshold; + private final NavigableMap aggregationsPerCheckpoint; + + private CompletedStatistics completedStatistics; + + AggregatedStatisticsTracker( + String operatorName, + int parallelism, + Schema schema, + SortOrder sortOrder, + int downstreamParallelism, + StatisticsType statisticsType, + int switchToSketchThreshold, + @Nullable CompletedStatistics restoredStatistics) { + this.operatorName = operatorName; + this.parallelism = parallelism; + this.statisticsSerializer = + new DataStatisticsSerializer(new SortKeySerializer(schema, sortOrder)); + this.downstreamParallelism = downstreamParallelism; + this.statisticsType = statisticsType; + this.switchToSketchThreshold = switchToSketchThreshold; + this.completedStatistics = restoredStatistics; + + this.aggregationsPerCheckpoint = Maps.newTreeMap(); + } + + CompletedStatistics updateAndCheckCompletion(int subtask, StatisticsEvent event) { + long checkpointId = event.checkpointId(); + LOG.debug( + "Handling statistics event from subtask {} of operator {} for checkpoint {}", + subtask, + operatorName, + checkpointId); + + if (completedStatistics != null && completedStatistics.checkpointId() > checkpointId) { + LOG.info( + "Ignore stale statistics event from operator {} subtask {} for older checkpoint {}. 
" + + "Was expecting data statistics from checkpoint higher than {}", + operatorName, + subtask, + checkpointId, + completedStatistics.checkpointId()); + return null; + } + + Aggregation aggregation = + aggregationsPerCheckpoint.computeIfAbsent( + checkpointId, + ignored -> + new Aggregation( + parallelism, + downstreamParallelism, + switchToSketchThreshold, + statisticsType, + StatisticsUtil.collectType(statisticsType, completedStatistics))); + DataStatistics dataStatistics = + StatisticsUtil.deserializeDataStatistics(event.statisticsBytes(), statisticsSerializer); + if (!aggregation.merge(subtask, dataStatistics)) { + LOG.debug( + "Ignore duplicate data statistics from operator {} subtask {} for checkpoint {}.", + operatorName, + subtask, + checkpointId); + } + + if (aggregation.isComplete()) { + this.completedStatistics = aggregation.completedStatistics(checkpointId); + // clean up aggregations up to the completed checkpoint id + aggregationsPerCheckpoint.headMap(checkpointId, true).clear(); + return completedStatistics; + } + + return null; + } + + @VisibleForTesting + NavigableMap aggregationsPerCheckpoint() { + return aggregationsPerCheckpoint; + } + + static class Aggregation { + private static final Logger LOG = LoggerFactory.getLogger(Aggregation.class); + + private final Set subtaskSet; + private final int parallelism; + private final int downstreamParallelism; + private final int switchToSketchThreshold; + private final StatisticsType configuredType; + private StatisticsType currentType; + private Map mapStatistics; + private ReservoirItemsUnion sketchStatistics; + + Aggregation( + int parallelism, + int downstreamParallelism, + int switchToSketchThreshold, + StatisticsType configuredType, + StatisticsType currentType) { + this.subtaskSet = Sets.newHashSet(); + this.parallelism = parallelism; + this.downstreamParallelism = downstreamParallelism; + this.switchToSketchThreshold = switchToSketchThreshold; + this.configuredType = configuredType; + this.currentType = currentType; + + if (currentType == StatisticsType.Map) { + this.mapStatistics = Maps.newHashMap(); + this.sketchStatistics = null; + } else { + this.mapStatistics = null; + this.sketchStatistics = + ReservoirItemsUnion.newInstance( + SketchUtil.determineCoordinatorReservoirSize(downstreamParallelism)); + } + } + + @VisibleForTesting + Set subtaskSet() { + return subtaskSet; + } + + @VisibleForTesting + StatisticsType currentType() { + return currentType; + } + + @VisibleForTesting + Map mapStatistics() { + return mapStatistics; + } + + @VisibleForTesting + ReservoirItemsUnion sketchStatistics() { + return sketchStatistics; + } + + private boolean isComplete() { + return subtaskSet.size() == parallelism; + } + + /** + * @return false if duplicate + */ + private boolean merge(int subtask, DataStatistics taskStatistics) { + if (subtaskSet.contains(subtask)) { + return false; + } + + subtaskSet.add(subtask); + merge(taskStatistics); + return true; + } + + @SuppressWarnings("unchecked") + private void merge(DataStatistics taskStatistics) { + if (taskStatistics.type() == StatisticsType.Map) { + Map taskMapStats = (Map) taskStatistics.result(); + if (currentType == StatisticsType.Map) { + taskMapStats.forEach((key, count) -> mapStatistics.merge(key, count, Long::sum)); + if (configuredType == StatisticsType.Auto + && mapStatistics.size() > switchToSketchThreshold) { + convertCoordinatorToSketch(); + } + } else { + // convert task stats to sketch first + ReservoirItemsSketch taskSketch = + ReservoirItemsSketch.newInstance( + 
SketchUtil.determineOperatorReservoirSize(parallelism, downstreamParallelism)); + SketchUtil.convertMapToSketch(taskMapStats, taskSketch::update); + sketchStatistics.update(taskSketch); + } + } else { + ReservoirItemsSketch taskSketch = + (ReservoirItemsSketch) taskStatistics.result(); + if (currentType == StatisticsType.Map) { + // convert global stats to sketch first + convertCoordinatorToSketch(); + } + + if (taskSketch.getNumSamples() > 0) { + sketchStatistics.update(taskSketch); + } + } + } + + private void convertCoordinatorToSketch() { + this.sketchStatistics = + ReservoirItemsUnion.newInstance( + SketchUtil.determineCoordinatorReservoirSize(downstreamParallelism)); + SketchUtil.convertMapToSketch(mapStatistics, sketchStatistics::update); + this.currentType = StatisticsType.Sketch; + this.mapStatistics = null; + } + + private CompletedStatistics completedStatistics(long checkpointId) { + if (currentType == StatisticsType.Map) { + LOG.info("Completed map statistics aggregation with {} keys", mapStatistics.size()); + return CompletedStatistics.fromKeyFrequency(checkpointId, mapStatistics); + } else { + ReservoirItemsSketch sketch = sketchStatistics.getResult(); + if (sketch != null) { + LOG.info( + "Completed sketch statistics aggregation: " + + "reservoir size = {}, number of items seen = {}, number of samples = {}", + sketch.getK(), + sketch.getN(), + sketch.getNumSamples()); + return CompletedStatistics.fromKeySamples(checkpointId, sketch.getSamples()); + } else { + LOG.info("Empty sketch statistics."); + return CompletedStatistics.fromKeySamples(checkpointId, new SortKey[0]); + } + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java new file mode 100644 index 000000000000..a8bf0f839e49 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import java.util.Map; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; + +/** + * This is what {@link AggregatedStatisticsTracker} returns upon a completed statistics aggregation + * from all subtasks. It contains the raw statistics (Map or reservoir samples). 
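As a self-contained illustration of the reservoir-sample form, using the DataSketches reservoir classes referenced elsewhere in this patch (the key names and the reservoir size of 8 are invented):

    import org.apache.datasketches.sampling.ReservoirItemsSketch;
    import org.apache.datasketches.sampling.ReservoirItemsUnion;

    class ReservoirSamplesSketch {
      public static void main(String[] args) {
        ReservoirItemsSketch<String> subtaskA = ReservoirItemsSketch.newInstance(8);
        ReservoirItemsSketch<String> subtaskB = ReservoirItemsSketch.newInstance(8);
        for (int i = 0; i < 1_000; i++) {
          subtaskA.update("keyA-" + i); // each reservoir keeps at most 8 uniform samples
          subtaskB.update("keyB-" + i);
        }

        ReservoirItemsUnion<String> union = ReservoirItemsUnion.newInstance(8);
        union.update(subtaskA);
        union.update(subtaskB);

        ReservoirItemsSketch<String> merged = union.getResult();
        System.out.println(merged.getNumSamples() + " samples out of " + merged.getN() + " items");
      }
    }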
+ */ +class CompletedStatistics { + private final long checkpointId; + private final StatisticsType type; + private final Map keyFrequency; + private final SortKey[] keySamples; + + static CompletedStatistics fromKeyFrequency(long checkpointId, Map stats) { + return new CompletedStatistics(checkpointId, StatisticsType.Map, stats, null); + } + + static CompletedStatistics fromKeySamples(long checkpointId, SortKey[] keySamples) { + return new CompletedStatistics(checkpointId, StatisticsType.Sketch, null, keySamples); + } + + CompletedStatistics( + long checkpointId, + StatisticsType type, + Map keyFrequency, + SortKey[] keySamples) { + this.checkpointId = checkpointId; + this.type = type; + this.keyFrequency = keyFrequency; + this.keySamples = keySamples; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("checkpointId", checkpointId) + .add("type", type) + .add("keyFrequency", keyFrequency) + .add("keySamples", keySamples) + .toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (!(o instanceof CompletedStatistics)) { + return false; + } + + CompletedStatistics other = (CompletedStatistics) o; + return Objects.equal(checkpointId, other.checkpointId) + && Objects.equal(type, other.type) + && Objects.equal(keyFrequency, other.keyFrequency()) + && Arrays.equals(keySamples, other.keySamples()); + } + + @Override + public int hashCode() { + return Objects.hashCode(checkpointId, type, keyFrequency, keySamples); + } + + long checkpointId() { + return checkpointId; + } + + StatisticsType type() { + return type; + } + + Map keyFrequency() { + return keyFrequency; + } + + SortKey[] keySamples() { + return keySamples; + } + + boolean isEmpty() { + if (type == StatisticsType.Sketch) { + return keySamples.length == 0; + } else { + return keyFrequency().isEmpty(); + } + } + + boolean isValid() { + if (type == StatisticsType.Sketch) { + if (null == keySamples) { + return false; + } + } else { + if (null == keyFrequency()) { + return false; + } + if (keyFrequency().values().contains(null)) { + return false; + } + } + + return true; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java new file mode 100644 index 000000000000..48c85a9bd91e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.base.EnumSerializer; +import org.apache.flink.api.common.typeutils.base.ListSerializer; +import org.apache.flink.api.common.typeutils.base.LongSerializer; +import org.apache.flink.api.common.typeutils.base.MapSerializer; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.iceberg.SortKey; + +class CompletedStatisticsSerializer extends TypeSerializer { + private final TypeSerializer sortKeySerializer; + private final EnumSerializer statisticsTypeSerializer; + private final MapSerializer keyFrequencySerializer; + private final ListSerializer keySamplesSerializer; + + CompletedStatisticsSerializer(TypeSerializer sortKeySerializer) { + this.sortKeySerializer = sortKeySerializer; + this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); + this.keyFrequencySerializer = new MapSerializer<>(sortKeySerializer, LongSerializer.INSTANCE); + this.keySamplesSerializer = new ListSerializer<>(sortKeySerializer); + } + + public void changeSortKeySerializerVersion(int version) { + if (sortKeySerializer instanceof SortKeySerializer) { + ((SortKeySerializer) sortKeySerializer).setVersion(version); + } + } + + public void changeSortKeySerializerVersionLatest() { + if (sortKeySerializer instanceof SortKeySerializer) { + ((SortKeySerializer) sortKeySerializer).restoreToLatestVersion(); + } + } + + @Override + public boolean isImmutableType() { + return false; + } + + @Override + public TypeSerializer duplicate() { + return new CompletedStatisticsSerializer(sortKeySerializer); + } + + @Override + public CompletedStatistics createInstance() { + return CompletedStatistics.fromKeyFrequency(0L, Collections.emptyMap()); + } + + @Override + public CompletedStatistics copy(CompletedStatistics from) { + return new CompletedStatistics( + from.checkpointId(), from.type(), from.keyFrequency(), from.keySamples()); + } + + @Override + public CompletedStatistics copy(CompletedStatistics from, CompletedStatistics reuse) { + // no benefit of reuse + return copy(from); + } + + @Override + public int getLength() { + return -1; + } + + @Override + public void serialize(CompletedStatistics record, DataOutputView target) throws IOException { + target.writeLong(record.checkpointId()); + statisticsTypeSerializer.serialize(record.type(), target); + if (record.type() == StatisticsType.Map) { + keyFrequencySerializer.serialize(record.keyFrequency(), target); + } else { + keySamplesSerializer.serialize(Arrays.asList(record.keySamples()), target); + } + } + + @Override + public CompletedStatistics deserialize(DataInputView source) throws IOException { + long checkpointId = source.readLong(); + StatisticsType type = statisticsTypeSerializer.deserialize(source); + if (type == StatisticsType.Map) { + Map keyFrequency = keyFrequencySerializer.deserialize(source); + return CompletedStatistics.fromKeyFrequency(checkpointId, keyFrequency); + } else { + List sortKeys = keySamplesSerializer.deserialize(source); + SortKey[] keySamples = new SortKey[sortKeys.size()]; + keySamples = 
sortKeys.toArray(keySamples); + return CompletedStatistics.fromKeySamples(checkpointId, keySamples); + } + } + + @Override + public CompletedStatistics deserialize(CompletedStatistics reuse, DataInputView source) + throws IOException { + // not much benefit to reuse + return deserialize(source); + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws IOException { + serialize(deserialize(source), target); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + + if (obj == null || getClass() != obj.getClass()) { + return false; + } + + CompletedStatisticsSerializer other = (CompletedStatisticsSerializer) obj; + return Objects.equals(sortKeySerializer, other.sortKeySerializer); + } + + @Override + public int hashCode() { + return sortKeySerializer.hashCode(); + } + + @Override + public TypeSerializerSnapshot snapshotConfiguration() { + return new CompletedStatisticsSerializerSnapshot(this); + } + + public static class CompletedStatisticsSerializerSnapshot + extends CompositeTypeSerializerSnapshot { + private static final int CURRENT_VERSION = 1; + + /** Constructor for read instantiation. */ + @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) + public CompletedStatisticsSerializerSnapshot() {} + + @SuppressWarnings("checkstyle:RedundantModifier") + public CompletedStatisticsSerializerSnapshot(CompletedStatisticsSerializer serializer) { + super(serializer); + } + + @Override + protected int getCurrentOuterSnapshotVersion() { + return CURRENT_VERSION; + } + + @Override + protected TypeSerializer[] getNestedSerializers( + CompletedStatisticsSerializer outerSerializer) { + return new TypeSerializer[] {outerSerializer.sortKeySerializer}; + } + + @Override + protected CompletedStatisticsSerializer createOuterSerializerWithNestedSerializers( + TypeSerializer[] nestedSerializers) { + SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; + return new CompletedStatisticsSerializer(sortKeySerializer); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java new file mode 100644 index 000000000000..76c59cd5f4b8 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Map; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.SortKey; + +/** + * DataStatistics defines the interface to collect data distribution information. + * + *
<p>
Data statistics tracks traffic volume distribution across data keys. For low-cardinality key, + * a simple map of (key, count) can be used. For high-cardinality key, probabilistic data structures + * (sketching) can be used. + */ +@Internal +interface DataStatistics { + + StatisticsType type(); + + boolean isEmpty(); + + /** Add row sortKey to data statistics. */ + void add(SortKey sortKey); + + /** + * Get the collected statistics. Could be a {@link Map} (low cardinality) or {@link + * ReservoirItemsSketch} (high cardinality) + */ + Object result(); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java new file mode 100644 index 000000000000..773d0fe6c65a --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Comparator; +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadFactory; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.util.ExceptionUtils; +import org.apache.flink.util.FatalExitExceptionHandler; +import org.apache.flink.util.FlinkRuntimeException; +import org.apache.flink.util.ThrowableCatchingRunnable; +import org.apache.flink.util.function.ThrowingRunnable; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * DataStatisticsCoordinator receives {@link StatisticsEvent} from {@link DataStatisticsOperator} + * every subtask and then merge them together. 
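A minimal, self-contained sketch of that merge for the map-based (low-cardinality) statistics, mirroring the Long::sum merge used by the tracker above (keys and counts are invented):

    import java.util.HashMap;
    import java.util.Map;

    class MapStatsMergeSketch {
      public static void main(String[] args) {
        Map<String, Long> merged = new HashMap<>();
        Map<String, Long> fromSubtask0 = Map.of("us", 10L, "eu", 2L);
        Map<String, Long> fromSubtask1 = Map.of("us", 7L, "apac", 5L);

        // Sum the per-key counts reported by each subtask.
        fromSubtask0.forEach((key, count) -> merged.merge(key, count, Long::sum));
        fromSubtask1.forEach((key, count) -> merged.merge(key, count, Long::sum));

        System.out.println(merged); // {us=17, eu=2, apac=5} in some iteration order
      }
    }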
Once aggregation for all subtasks data statistics + * completes, DataStatisticsCoordinator will send the aggregated data statistics back to {@link + * DataStatisticsOperator}. In the end a custom partitioner will distribute traffic based on the + * aggregated data statistics to improve data clustering. + */ +@Internal +class DataStatisticsCoordinator implements OperatorCoordinator { + private static final Logger LOG = LoggerFactory.getLogger(DataStatisticsCoordinator.class); + + private final String operatorName; + private final OperatorCoordinator.Context context; + private final Schema schema; + private final SortOrder sortOrder; + private final Comparator comparator; + private final int downstreamParallelism; + private final StatisticsType statisticsType; + private final double closeFileCostWeightPercentage; + + private final ExecutorService coordinatorExecutor; + private final SubtaskGateways subtaskGateways; + private final CoordinatorExecutorThreadFactory coordinatorThreadFactory; + private final TypeSerializer completedStatisticsSerializer; + private final TypeSerializer globalStatisticsSerializer; + + private transient boolean started; + private transient AggregatedStatisticsTracker aggregatedStatisticsTracker; + private transient CompletedStatistics completedStatistics; + private transient GlobalStatistics globalStatistics; + + DataStatisticsCoordinator( + String operatorName, + OperatorCoordinator.Context context, + Schema schema, + SortOrder sortOrder, + int downstreamParallelism, + StatisticsType statisticsType, + double closeFileCostWeightPercentage) { + this.operatorName = operatorName; + this.context = context; + this.schema = schema; + this.sortOrder = sortOrder; + this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); + this.downstreamParallelism = downstreamParallelism; + this.statisticsType = statisticsType; + this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; + + this.coordinatorThreadFactory = + new CoordinatorExecutorThreadFactory( + "DataStatisticsCoordinator-" + operatorName, context.getUserCodeClassloader()); + this.coordinatorExecutor = Executors.newSingleThreadExecutor(coordinatorThreadFactory); + this.subtaskGateways = new SubtaskGateways(operatorName, context.currentParallelism()); + SortKeySerializer sortKeySerializer = new SortKeySerializer(schema, sortOrder); + this.completedStatisticsSerializer = new CompletedStatisticsSerializer(sortKeySerializer); + this.globalStatisticsSerializer = new GlobalStatisticsSerializer(sortKeySerializer); + } + + @Override + public void start() throws Exception { + LOG.info("Starting data statistics coordinator: {}.", operatorName); + this.started = true; + + // statistics are restored already in resetToCheckpoint() before start() called + this.aggregatedStatisticsTracker = + new AggregatedStatisticsTracker( + operatorName, + context.currentParallelism(), + schema, + sortOrder, + downstreamParallelism, + statisticsType, + SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, + completedStatistics); + } + + @Override + public void close() throws Exception { + coordinatorExecutor.shutdown(); + this.aggregatedStatisticsTracker = null; + this.started = false; + LOG.info("Closed data statistics coordinator: {}.", operatorName); + } + + @VisibleForTesting + void callInCoordinatorThread(Callable callable, String errorMessage) { + ensureStarted(); + // Ensure the task is done by the coordinator executor. 
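Everything that touches coordinator state is funneled onto the one thread created by CoordinatorExecutorThreadFactory, so event handlers, checkpoint calls, and gateway updates never race. A stripped-down sketch of that confinement pattern, with illustrative names rather than the patch's classes, looks like this:

    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;

    // Illustrative single-thread confinement: mutations run on one owning executor thread;
    // callers on other threads submit work instead of touching the state directly.
    class SingleThreadSection {
      private final ExecutorService executor;
      private volatile Thread executorThread;
      private long mutableState = 0;

      SingleThreadSection(String threadName) {
        this.executor =
            Executors.newSingleThreadExecutor(
                runnable -> {
                  Thread thread = new Thread(runnable, threadName);
                  this.executorThread = thread;
                  return thread;
                });
      }

      void mutate(long delta) {
        if (Thread.currentThread() == executorThread) {
          mutableState += delta; // already on the owning thread
        } else {
          executor.execute(() -> mutableState += delta); // marshal onto the owning thread
        }
      }
    }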
+ if (!coordinatorThreadFactory.isCurrentThreadCoordinatorThread()) { + try { + Callable guardedCallable = + () -> { + try { + return callable.call(); + } catch (Throwable t) { + LOG.error( + "Uncaught Exception in data statistics coordinator: {} executor", + operatorName, + t); + ExceptionUtils.rethrowException(t); + return null; + } + }; + + coordinatorExecutor.submit(guardedCallable).get(); + } catch (InterruptedException | ExecutionException e) { + throw new FlinkRuntimeException(errorMessage, e); + } + } else { + try { + callable.call(); + } catch (Throwable t) { + LOG.error( + "Uncaught Exception in data statistics coordinator: {} executor", operatorName, t); + throw new FlinkRuntimeException(errorMessage, t); + } + } + } + + public void runInCoordinatorThread(Runnable runnable) { + this.coordinatorExecutor.execute( + new ThrowableCatchingRunnable( + throwable -> + this.coordinatorThreadFactory.uncaughtException(Thread.currentThread(), throwable), + runnable)); + } + + private void runInCoordinatorThread(ThrowingRunnable action, String actionString) { + ensureStarted(); + runInCoordinatorThread( + () -> { + try { + action.run(); + } catch (Throwable t) { + ExceptionUtils.rethrowIfFatalErrorOrOOM(t); + LOG.error( + "Uncaught exception in the data statistics coordinator: {} while {}. Triggering job failover", + operatorName, + actionString, + t); + context.failJob(t); + } + }); + } + + private void ensureStarted() { + Preconditions.checkState(started, "The coordinator of %s has not started yet.", operatorName); + } + + private void handleDataStatisticRequest(int subtask, StatisticsEvent event) { + CompletedStatistics maybeCompletedStatistics = + aggregatedStatisticsTracker.updateAndCheckCompletion(subtask, event); + + if (maybeCompletedStatistics != null) { + if (maybeCompletedStatistics.isEmpty()) { + LOG.info( + "Skip aggregated statistics for checkpoint {} as it is empty.", event.checkpointId()); + } else { + LOG.info("Completed statistics aggregation for checkpoint {}", event.checkpointId()); + // completedStatistics contains the complete samples, which is needed to compute + // the range bounds in globalStatistics if downstreamParallelism changed. + this.completedStatistics = maybeCompletedStatistics; + // globalStatistics only contains assignment calculated based on Map or Sketch statistics + this.globalStatistics = + globalStatistics( + maybeCompletedStatistics, + downstreamParallelism, + comparator, + closeFileCostWeightPercentage); + sendGlobalStatisticsToSubtasks(globalStatistics); + } + } + } + + private static GlobalStatistics globalStatistics( + CompletedStatistics completedStatistics, + int downstreamParallelism, + Comparator comparator, + double closeFileCostWeightPercentage) { + if (completedStatistics.type() == StatisticsType.Sketch) { + // range bound is a much smaller array compared to the complete samples. + // It helps reduce the amount of data transfer from coordinator to operator subtasks. 
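SketchUtil.rangeBounds itself is not part of this hunk, but the idea is to cut the sorted reservoir samples into downstreamParallelism buckets and ship only the cut points. A rough integer-based sketch of that idea follows; the real code works on SortKey with the sort-order comparator, and details such as duplicate-bound handling are not shown here.

    import java.util.Arrays;

    // Illustrative quantile-style cut points: parallelism - 1 bounds split the sorted
    // samples into roughly equal buckets; subtask i handles keys up to bounds[i].
    final class RangeBoundsSketch {
      private RangeBoundsSketch() {}

      static int[] rangeBounds(int parallelism, int[] samples) {
        int[] sorted = samples.clone();
        Arrays.sort(sorted);
        int[] bounds = new int[parallelism - 1];
        for (int i = 0; i < bounds.length; ++i) {
          // pick the sample at the (i + 1) / parallelism quantile as the upper bound of bucket i
          int index = (int) (((long) (i + 1) * sorted.length) / parallelism) - 1;
          bounds[i] = sorted[Math.max(index, 0)];
        }

        return bounds;
      }
    }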
+ return GlobalStatistics.fromRangeBounds( + completedStatistics.checkpointId(), + SketchUtil.rangeBounds( + downstreamParallelism, comparator, completedStatistics.keySamples())); + } else { + return GlobalStatistics.fromMapAssignment( + completedStatistics.checkpointId(), + MapAssignment.fromKeyFrequency( + downstreamParallelism, + completedStatistics.keyFrequency(), + closeFileCostWeightPercentage, + comparator)); + } + } + + @SuppressWarnings("FutureReturnValueIgnored") + private void sendGlobalStatisticsToSubtasks(GlobalStatistics statistics) { + runInCoordinatorThread( + () -> { + LOG.info( + "Broadcast latest global statistics from checkpoint {} to all subtasks", + statistics.checkpointId()); + // applyImmediately is set to false so that operator subtasks can + // apply the change at checkpoint boundary + StatisticsEvent statisticsEvent = + StatisticsEvent.createGlobalStatisticsEvent( + statistics, globalStatisticsSerializer, false); + for (int i = 0; i < context.currentParallelism(); ++i) { + // Ignore future return value for potential error (e.g. subtask down). + // Upon restart, subtasks send request to coordinator to refresh statistics + // if there is any difference + subtaskGateways.getSubtaskGateway(i).sendEvent(statisticsEvent); + } + }, + String.format( + Locale.ROOT, + "Failed to send operator %s coordinator global data statistics for checkpoint %d", + operatorName, + statistics.checkpointId())); + } + + @SuppressWarnings("FutureReturnValueIgnored") + private void handleRequestGlobalStatisticsEvent(int subtask, RequestGlobalStatisticsEvent event) { + if (globalStatistics != null) { + runInCoordinatorThread( + () -> { + if (event.signature() != null && event.signature() == globalStatistics.hashCode()) { + LOG.debug( + "Skip responding to statistics request from subtask {}, as the operator task already holds the same global statistics", + subtask); + } else { + LOG.info( + "Send latest global statistics from checkpoint {} to subtask {}", + globalStatistics.checkpointId(), + subtask); + StatisticsEvent statisticsEvent = + StatisticsEvent.createGlobalStatisticsEvent( + globalStatistics, globalStatisticsSerializer, true); + subtaskGateways.getSubtaskGateway(subtask).sendEvent(statisticsEvent); + } + }, + String.format( + Locale.ROOT, + "Failed to send operator %s coordinator global data statistics to requesting subtask %d for checkpoint %d", + operatorName, + subtask, + globalStatistics.checkpointId())); + } else { + LOG.info( + "Ignore global statistics request from subtask {} as statistics not available", subtask); + } + } + + @Override + public void handleEventFromOperator(int subtask, int attemptNumber, OperatorEvent event) { + runInCoordinatorThread( + () -> { + LOG.debug( + "Handling event from subtask {} (#{}) of {}: {}", + subtask, + attemptNumber, + operatorName, + event); + if (event instanceof StatisticsEvent) { + handleDataStatisticRequest(subtask, ((StatisticsEvent) event)); + } else if (event instanceof RequestGlobalStatisticsEvent) { + handleRequestGlobalStatisticsEvent(subtask, (RequestGlobalStatisticsEvent) event); + } else { + throw new IllegalArgumentException( + "Invalid operator event type: " + event.getClass().getCanonicalName()); + } + }, + String.format( + Locale.ROOT, + "handling operator event %s from subtask %d (#%d)", + event.getClass(), + subtask, + attemptNumber)); + } + + @Override + public void checkpointCoordinator(long checkpointId, CompletableFuture resultFuture) { + runInCoordinatorThread( + () -> { + LOG.debug( + "Snapshotting data statistics 
coordinator {} for checkpoint {}", + operatorName, + checkpointId); + if (completedStatistics == null) { + // null checkpoint result is not allowed, hence supply an empty byte array + resultFuture.complete(new byte[0]); + } else { + resultFuture.complete( + StatisticsUtil.serializeCompletedStatistics( + completedStatistics, completedStatisticsSerializer)); + } + }, + String.format(Locale.ROOT, "taking checkpoint %d", checkpointId)); + } + + @Override + public void notifyCheckpointComplete(long checkpointId) {} + + @Override + public void resetToCheckpoint(long checkpointId, byte[] checkpointData) { + Preconditions.checkState( + !started, "The coordinator %s can only be reset if it was not yet started", operatorName); + if (checkpointData == null || checkpointData.length == 0) { + LOG.info( + "Data statistic coordinator {} has nothing to restore from checkpoint {}", + operatorName, + checkpointId); + return; + } + + LOG.info( + "Restoring data statistic coordinator {} from checkpoint {}", operatorName, checkpointId); + this.completedStatistics = + StatisticsUtil.deserializeCompletedStatistics( + checkpointData, (CompletedStatisticsSerializer) completedStatisticsSerializer); + + // recompute global statistics in case downstream parallelism changed + this.globalStatistics = + globalStatistics( + completedStatistics, downstreamParallelism, comparator, closeFileCostWeightPercentage); + } + + @Override + public void subtaskReset(int subtask, long checkpointId) { + runInCoordinatorThread( + () -> { + LOG.info( + "Operator {} subtask {} is reset to checkpoint {}", + operatorName, + subtask, + checkpointId); + Preconditions.checkState( + this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); + subtaskGateways.reset(subtask); + }, + String.format( + Locale.ROOT, "handling subtask %d recovery to checkpoint %d", subtask, checkpointId)); + } + + @Override + public void executionAttemptFailed(int subtask, int attemptNumber, @Nullable Throwable reason) { + runInCoordinatorThread( + () -> { + LOG.info( + "Unregistering gateway after failure for subtask {} (#{}) of data statistics {}", + subtask, + attemptNumber, + operatorName); + Preconditions.checkState( + this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); + subtaskGateways.unregisterSubtaskGateway(subtask, attemptNumber); + }, + String.format(Locale.ROOT, "handling subtask %d (#%d) failure", subtask, attemptNumber)); + } + + @Override + public void executionAttemptReady(int subtask, int attemptNumber, SubtaskGateway gateway) { + Preconditions.checkArgument(subtask == gateway.getSubtask()); + Preconditions.checkArgument(attemptNumber == gateway.getExecution().getAttemptNumber()); + runInCoordinatorThread( + () -> { + Preconditions.checkState( + this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); + subtaskGateways.registerSubtaskGateway(gateway); + }, + String.format( + Locale.ROOT, + "making event gateway to subtask %d (#%d) available", + subtask, + attemptNumber)); + } + + @VisibleForTesting + CompletedStatistics completedStatistics() { + return completedStatistics; + } + + @VisibleForTesting + GlobalStatistics globalStatistics() { + return globalStatistics; + } + + private static class SubtaskGateways { + private final String operatorName; + private final Map[] gateways; + + @SuppressWarnings("unchecked") + private SubtaskGateways(String operatorName, int parallelism) { + this.operatorName = operatorName; + gateways = new Map[parallelism]; + + for (int i = 0; i < parallelism; ++i) { + gateways[i] = 
Maps.newHashMap(); + } + } + + private void registerSubtaskGateway(OperatorCoordinator.SubtaskGateway gateway) { + int subtaskIndex = gateway.getSubtask(); + int attemptNumber = gateway.getExecution().getAttemptNumber(); + Preconditions.checkState( + !gateways[subtaskIndex].containsKey(attemptNumber), + "Coordinator of %s already has a subtask gateway for %d (#%d)", + operatorName, + subtaskIndex, + attemptNumber); + LOG.debug( + "Coordinator of {} registers gateway for subtask {} attempt {}", + operatorName, + subtaskIndex, + attemptNumber); + gateways[subtaskIndex].put(attemptNumber, gateway); + } + + private void unregisterSubtaskGateway(int subtaskIndex, int attemptNumber) { + LOG.debug( + "Coordinator of {} unregisters gateway for subtask {} attempt {}", + operatorName, + subtaskIndex, + attemptNumber); + gateways[subtaskIndex].remove(attemptNumber); + } + + private OperatorCoordinator.SubtaskGateway getSubtaskGateway(int subtaskIndex) { + Preconditions.checkState( + !gateways[subtaskIndex].isEmpty(), + "Coordinator of %s subtask %d is not ready yet to receive events", + operatorName, + subtaskIndex); + return Iterables.getOnlyElement(gateways[subtaskIndex].values()); + } + + private void reset(int subtaskIndex) { + gateways[subtaskIndex].clear(); + } + } + + private static class CoordinatorExecutorThreadFactory + implements ThreadFactory, Thread.UncaughtExceptionHandler { + + private final String coordinatorThreadName; + private final ClassLoader classLoader; + private final Thread.UncaughtExceptionHandler errorHandler; + + @javax.annotation.Nullable private Thread thread; + + CoordinatorExecutorThreadFactory( + final String coordinatorThreadName, final ClassLoader contextClassLoader) { + this(coordinatorThreadName, contextClassLoader, FatalExitExceptionHandler.INSTANCE); + } + + @org.apache.flink.annotation.VisibleForTesting + CoordinatorExecutorThreadFactory( + final String coordinatorThreadName, + final ClassLoader contextClassLoader, + final Thread.UncaughtExceptionHandler errorHandler) { + this.coordinatorThreadName = coordinatorThreadName; + this.classLoader = contextClassLoader; + this.errorHandler = errorHandler; + } + + @Override + public synchronized Thread newThread(@NotNull Runnable runnable) { + thread = new Thread(runnable, coordinatorThreadName); + thread.setContextClassLoader(classLoader); + thread.setUncaughtExceptionHandler(this); + return thread; + } + + @Override + public synchronized void uncaughtException(Thread t, Throwable e) { + errorHandler.uncaughtException(t, e); + } + + boolean isCurrentThreadCoordinatorThread() { + return Thread.currentThread() == thread; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java new file mode 100644 index 000000000000..9d7d989c298e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
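The checkpoint contract above is a simple byte[] convention: an empty array stands for "nothing aggregated yet", anything else is the serialized CompletedStatistics, and after a restore the global statistics are recomputed so a changed downstream parallelism takes effect. A minimal restore helper mirroring that convention is sketched below; it is illustrative only, and StatisticsUtil's internals are not shown in this hunk.

    // Illustrative restore helper: an empty payload means the coordinator had nothing
    // aggregated at checkpoint time, otherwise the serialized statistics are decoded.
    final class StatisticsCheckpoints {
      private StatisticsCheckpoints() {}

      static CompletedStatistics restore(
          byte[] checkpointData, CompletedStatisticsSerializer serializer) {
        if (checkpointData == null || checkpointData.length == 0) {
          return null;
        }

        return StatisticsUtil.deserializeCompletedStatistics(checkpointData, serializer);
      }
    }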
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.RecreateOnResetOperatorCoordinator; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; + +/** + * DataStatisticsCoordinatorProvider provides the method to create new {@link + * DataStatisticsCoordinator} + */ +@Internal +public class DataStatisticsCoordinatorProvider extends RecreateOnResetOperatorCoordinator.Provider { + + private final String operatorName; + private final Schema schema; + private final SortOrder sortOrder; + private final int downstreamParallelism; + private final StatisticsType type; + private final double closeFileCostWeightPercentage; + + public DataStatisticsCoordinatorProvider( + String operatorName, + OperatorID operatorID, + Schema schema, + SortOrder sortOrder, + int downstreamParallelism, + StatisticsType type, + double closeFileCostWeightPercentage) { + super(operatorID); + this.operatorName = operatorName; + this.schema = schema; + this.sortOrder = sortOrder; + this.downstreamParallelism = downstreamParallelism; + this.type = type; + this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; + } + + @Override + public OperatorCoordinator getCoordinator(OperatorCoordinator.Context context) { + return new DataStatisticsCoordinator( + operatorName, + context, + schema, + sortOrder, + downstreamParallelism, + type, + closeFileCostWeightPercentage); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java new file mode 100644 index 000000000000..a873136c9194 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java @@ -0,0 +1,269 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Map; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.coordination.OperatorEventGateway; +import org.apache.flink.runtime.operators.coordination.OperatorEventHandler; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * DataStatisticsOperator collects traffic distribution statistics. A custom partitioner shall be + * attached to the DataStatisticsOperator output. The custom partitioner leverages the statistics to + * shuffle record to improve data clustering while maintaining relative balanced traffic + * distribution to downstream subtasks. + */ +@Internal +public class DataStatisticsOperator extends AbstractStreamOperator + implements OneInputStreamOperator, OperatorEventHandler { + + private static final long serialVersionUID = 1L; + + private final String operatorName; + private final RowDataWrapper rowDataWrapper; + private final SortKey sortKey; + private final OperatorEventGateway operatorEventGateway; + private final int downstreamParallelism; + private final StatisticsType statisticsType; + private final TypeSerializer taskStatisticsSerializer; + private final TypeSerializer globalStatisticsSerializer; + + private transient int parallelism; + private transient int subtaskIndex; + private transient ListState globalStatisticsState; + // current statistics type may be different from the config due to possible + // migration from Map statistics to Sketch statistics when high cardinality detected + private transient volatile StatisticsType taskStatisticsType; + private transient volatile DataStatistics localStatistics; + private transient volatile GlobalStatistics globalStatistics; + + DataStatisticsOperator( + StreamOperatorParameters parameters, + String operatorName, + Schema schema, + SortOrder sortOrder, + OperatorEventGateway operatorEventGateway, + int downstreamParallelism, + StatisticsType statisticsType) { + super(parameters); + this.operatorName = operatorName; + this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + this.sortKey = new SortKey(schema, sortOrder); + this.operatorEventGateway = operatorEventGateway; + this.downstreamParallelism = downstreamParallelism; + this.statisticsType = statisticsType; + + SortKeySerializer sortKeySerializer = new SortKeySerializer(schema, sortOrder); + this.taskStatisticsSerializer = new 
DataStatisticsSerializer(sortKeySerializer); + this.globalStatisticsSerializer = new GlobalStatisticsSerializer(sortKeySerializer); + } + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + this.parallelism = getRuntimeContext().getTaskInfo().getNumberOfParallelSubtasks(); + this.subtaskIndex = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); + + // Use union state so that new subtasks can also restore global statistics during scale-up. + this.globalStatisticsState = + context + .getOperatorStateStore() + .getUnionListState( + new ListStateDescriptor<>("globalStatisticsState", globalStatisticsSerializer)); + + if (context.isRestored()) { + if (globalStatisticsState.get() == null + || !globalStatisticsState.get().iterator().hasNext()) { + LOG.info( + "Operator {} subtask {} doesn't have global statistics state to restore", + operatorName, + subtaskIndex); + // If Flink deprecates union state in the future, RequestGlobalStatisticsEvent can be + // leveraged to request global statistics from coordinator if new subtasks (scale-up case) + // has nothing to restore from. + } else { + GlobalStatistics restoredStatistics = globalStatisticsState.get().iterator().next(); + LOG.info( + "Operator {} subtask {} restored global statistics state", operatorName, subtaskIndex); + this.globalStatistics = restoredStatistics; + } + + // Always request for new statistics from coordinator upon task initialization. + // There are a few scenarios this is needed + // 1. downstream writer parallelism changed due to rescale. + // 2. coordinator failed to send the aggregated statistics to subtask + // (e.g. due to subtask failure at the time). + // Records may flow before coordinator can respond. Range partitioner should be + // able to continue to operate with potentially suboptimal behavior (in sketch case). + LOG.info( + "Operator {} subtask {} requests new global statistics from coordinator ", + operatorName, + subtaskIndex); + // coordinator can use the hashCode (if available) in the request event to determine + // if operator already has the latest global statistics and respond can be skipped. + // This makes the handling cheap in most situations. + RequestGlobalStatisticsEvent event = + globalStatistics != null + ? 
new RequestGlobalStatisticsEvent(globalStatistics.hashCode()) + : new RequestGlobalStatisticsEvent(); + operatorEventGateway.sendEventToCoordinator(event); + } + + this.taskStatisticsType = StatisticsUtil.collectType(statisticsType, globalStatistics); + this.localStatistics = + StatisticsUtil.createTaskStatistics(taskStatisticsType, parallelism, downstreamParallelism); + } + + @Override + public void open() throws Exception { + if (globalStatistics != null) { + output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); + } + } + + @Override + public void handleOperatorEvent(OperatorEvent event) { + Preconditions.checkArgument( + event instanceof StatisticsEvent, + String.format( + "Operator %s subtask %s received unexpected operator event %s", + operatorName, subtaskIndex, event.getClass())); + StatisticsEvent statisticsEvent = (StatisticsEvent) event; + LOG.info( + "Operator {} subtask {} received global data event from coordinator checkpoint {}", + operatorName, + subtaskIndex, + statisticsEvent.checkpointId()); + this.globalStatistics = + StatisticsUtil.deserializeGlobalStatistics( + statisticsEvent.statisticsBytes(), globalStatisticsSerializer); + checkStatisticsTypeMigration(); + // if applyImmediately not set, wait until the checkpoint time to switch + if (statisticsEvent.applyImmediately()) { + output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); + } + } + + @Override + public void processElement(StreamRecord streamRecord) { + // collect data statistics + RowData record = streamRecord.getValue(); + StructLike struct = rowDataWrapper.wrap(record); + sortKey.wrap(struct); + localStatistics.add(sortKey); + + checkStatisticsTypeMigration(); + output.collect(new StreamRecord<>(StatisticsOrRecord.fromRecord(record))); + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + long checkpointId = context.getCheckpointId(); + LOG.info( + "Operator {} subtask {} snapshotting data statistics for checkpoint {}", + operatorName, + subtaskIndex, + checkpointId); + + // Pass global statistics to partitioner so that all the operators refresh statistics + // at same checkpoint barrier + if (globalStatistics != null) { + output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); + } + + // Only subtask 0 saves the state so that globalStatisticsState(UnionListState) stores + // an exact copy of globalStatistics + if (globalStatistics != null + && getRuntimeContext().getTaskInfo().getIndexOfThisSubtask() == 0) { + globalStatisticsState.clear(); + LOG.info( + "Operator {} subtask {} saving global statistics to state", operatorName, subtaskIndex); + globalStatisticsState.add(globalStatistics); + LOG.debug( + "Operator {} subtask {} saved global statistics to state: {}", + operatorName, + subtaskIndex, + globalStatistics); + } + + // For now, local statistics are sent to coordinator at checkpoint + LOG.info( + "Operator {} Subtask {} sending local statistics to coordinator for checkpoint {}", + operatorName, + subtaskIndex, + checkpointId); + operatorEventGateway.sendEventToCoordinator( + StatisticsEvent.createTaskStatisticsEvent( + checkpointId, localStatistics, taskStatisticsSerializer)); + + // Recreate the local statistics + localStatistics = + StatisticsUtil.createTaskStatistics(taskStatisticsType, parallelism, downstreamParallelism); + } + + @SuppressWarnings("unchecked") + private void checkStatisticsTypeMigration() { + // only check if the statisticsType config is 
Auto and localStatistics is currently Map type + if (statisticsType == StatisticsType.Auto && localStatistics.type() == StatisticsType.Map) { + Map mapStatistics = (Map) localStatistics.result(); + // convert if local statistics has cardinality over the threshold or + // if received global statistics is already sketch type + if (mapStatistics.size() > SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD + || (globalStatistics != null && globalStatistics.type() == StatisticsType.Sketch)) { + LOG.info( + "Operator {} subtask {} switched local statistics from Map to Sketch.", + operatorName, + subtaskIndex); + this.taskStatisticsType = StatisticsType.Sketch; + this.localStatistics = + StatisticsUtil.createTaskStatistics( + taskStatisticsType, parallelism, downstreamParallelism); + SketchUtil.convertMapToSketch(mapStatistics, localStatistics::add); + } + } + } + + @VisibleForTesting + DataStatistics localStatistics() { + return localStatistics; + } + + @VisibleForTesting + GlobalStatistics globalStatistics() { + return globalStatistics; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java new file mode 100644 index 000000000000..7ece89fdcc69 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
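The migration above only happens in Auto mode: once the local count map grows past the operator threshold, or the coordinator has already switched to sketch statistics, the operator recreates its local statistics as a sketch and replays the collected keys through SketchUtil.convertMapToSketch. The decision itself reduces to a small predicate, restated here for clarity to match the condition in checkStatisticsTypeMigration:

    // Illustrative predicate for the Auto-mode switch: migrate to sketch statistics when
    // local cardinality crosses the operator threshold or the coordinator already migrated.
    final class StatisticsMigration {
      private StatisticsMigration() {}

      static boolean shouldSwitchToSketch(
          StatisticsType configuredType,
          StatisticsType localType,
          int localCardinality,
          int operatorSwitchThreshold,
          StatisticsType globalTypeOrNull) {
        return configuredType == StatisticsType.Auto
            && localType == StatisticsType.Map
            && (localCardinality > operatorSwitchThreshold
                || globalTypeOrNull == StatisticsType.Sketch);
      }
    }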
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.OperatorEventGateway; +import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.CoordinatedOperatorFactory; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; + +@Internal +public class DataStatisticsOperatorFactory extends AbstractStreamOperatorFactory + implements CoordinatedOperatorFactory, + OneInputStreamOperatorFactory { + + private final Schema schema; + private final SortOrder sortOrder; + private final int downstreamParallelism; + private final StatisticsType type; + private final double closeFileCostWeightPercentage; + + public DataStatisticsOperatorFactory( + Schema schema, + SortOrder sortOrder, + int downstreamParallelism, + StatisticsType type, + double closeFileCostWeightPercentage) { + this.schema = schema; + this.sortOrder = sortOrder; + this.downstreamParallelism = downstreamParallelism; + this.type = type; + this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; + } + + @Override + public OperatorCoordinator.Provider getCoordinatorProvider( + String operatorName, OperatorID operatorID) { + return new DataStatisticsCoordinatorProvider( + operatorName, + operatorID, + schema, + sortOrder, + downstreamParallelism, + type, + closeFileCostWeightPercentage); + } + + @SuppressWarnings("unchecked") + @Override + public > T createStreamOperator( + StreamOperatorParameters parameters) { + OperatorID operatorId = parameters.getStreamConfig().getOperatorID(); + String operatorName = parameters.getStreamConfig().getOperatorName(); + OperatorEventGateway gateway = + parameters.getOperatorEventDispatcher().getOperatorEventGateway(operatorId); + + DataStatisticsOperator rangeStatisticsOperator = + new DataStatisticsOperator( + parameters, operatorName, schema, sortOrder, gateway, downstreamParallelism, type); + + parameters + .getOperatorEventDispatcher() + .registerEventHandler(operatorId, rangeStatisticsOperator); + + return (T) rangeStatisticsOperator; + } + + @SuppressWarnings("rawtypes") + @Override + public Class getStreamOperatorClass(ClassLoader classLoader) { + return DataStatisticsOperator.class; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java new file mode 100644 index 000000000000..8ce99073836d --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
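For context, a hypothetical wiring of this factory into a pipeline could look like the sketch below. The schema, sortOrder, writerParallelism, the StatisticsOrRecord type information, and the 2.0 close-file cost percentage are all stand-ins for values the actual sink builder would supply; that wiring is outside this hunk.

    import org.apache.flink.api.common.typeinfo.TypeInformation;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
    import org.apache.flink.table.data.RowData;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.SortOrder;

    // Hypothetical helper showing where DataStatisticsOperatorFactory would be attached.
    final class RangeShuffleWiring {
      private RangeShuffleWiring() {}

      static SingleOutputStreamOperator<StatisticsOrRecord> attach(
          DataStream<RowData> input,
          Schema schema,
          SortOrder sortOrder,
          int writerParallelism,
          TypeInformation<StatisticsOrRecord> statisticsOrRecordTypeInfo) {
        return input.transform(
            "range shuffle statistics",
            statisticsOrRecordTypeInfo,
            new DataStatisticsOperatorFactory(
                schema, sortOrder, writerParallelism, StatisticsType.Auto, 2.0));
      }
    }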
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.util.Map; +import java.util.Objects; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.base.EnumSerializer; +import org.apache.flink.api.common.typeutils.base.LongSerializer; +import org.apache.flink.api.common.typeutils.base.MapSerializer; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +@Internal +class DataStatisticsSerializer extends TypeSerializer { + private final TypeSerializer sortKeySerializer; + private final EnumSerializer statisticsTypeSerializer; + private final MapSerializer mapSerializer; + private final SortKeySketchSerializer sketchSerializer; + + DataStatisticsSerializer(TypeSerializer sortKeySerializer) { + this.sortKeySerializer = sortKeySerializer; + this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); + this.mapSerializer = new MapSerializer<>(sortKeySerializer, LongSerializer.INSTANCE); + this.sketchSerializer = new SortKeySketchSerializer(sortKeySerializer); + } + + @Override + public boolean isImmutableType() { + return false; + } + + @SuppressWarnings("ReferenceEquality") + @Override + public TypeSerializer duplicate() { + TypeSerializer duplicateSortKeySerializer = sortKeySerializer.duplicate(); + return (duplicateSortKeySerializer == sortKeySerializer) + ? this + : new DataStatisticsSerializer(duplicateSortKeySerializer); + } + + @Override + public DataStatistics createInstance() { + return new MapDataStatistics(); + } + + @SuppressWarnings("unchecked") + @Override + public DataStatistics copy(DataStatistics obj) { + StatisticsType statisticsType = obj.type(); + if (statisticsType == StatisticsType.Map) { + MapDataStatistics from = (MapDataStatistics) obj; + Map fromStats = (Map) from.result(); + Map toStats = Maps.newHashMap(fromStats); + return new MapDataStatistics(toStats); + } else if (statisticsType == StatisticsType.Sketch) { + // because ReservoirItemsSketch doesn't expose enough public methods for cloning, + // this implementation adopted the less efficient serialization and deserialization. 
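The sketch copy above goes through bytes because ReservoirItemsSketch offers no public deep-copy. The same serialize-then-deserialize trick works for any Flink TypeSerializer, as in this small generic helper (illustrative, not from the patch):

    import java.io.IOException;
    import org.apache.flink.api.common.typeutils.TypeSerializer;
    import org.apache.flink.core.memory.DataInputDeserializer;
    import org.apache.flink.core.memory.DataOutputSerializer;

    // Illustrative deep copy through serialization: write the value out and read it back
    // when the type offers no cheaper cloning path.
    final class SerializationCopy {
      private SerializationCopy() {}

      static <T> T copyThroughBytes(TypeSerializer<T> serializer, T value) throws IOException {
        DataOutputSerializer out = new DataOutputSerializer(128);
        serializer.serialize(value, out);
        DataInputDeserializer in = new DataInputDeserializer(out.getCopyOfBuffer());
        return serializer.deserialize(in);
      }
    }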
+ SketchDataStatistics from = (SketchDataStatistics) obj; + ReservoirItemsSketch fromStats = (ReservoirItemsSketch) from.result(); + byte[] bytes = fromStats.toByteArray(sketchSerializer); + Memory memory = Memory.wrap(bytes); + ReservoirItemsSketch toStats = + ReservoirItemsSketch.heapify(memory, sketchSerializer); + return new SketchDataStatistics(toStats); + } else { + throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); + } + } + + @Override + public DataStatistics copy(DataStatistics from, DataStatistics reuse) { + // not much benefit to reuse + return copy(from); + } + + @Override + public int getLength() { + return -1; + } + + @SuppressWarnings("unchecked") + @Override + public void serialize(DataStatistics obj, DataOutputView target) throws IOException { + StatisticsType statisticsType = obj.type(); + statisticsTypeSerializer.serialize(obj.type(), target); + if (statisticsType == StatisticsType.Map) { + Map mapStatistics = (Map) obj.result(); + mapSerializer.serialize(mapStatistics, target); + } else if (statisticsType == StatisticsType.Sketch) { + ReservoirItemsSketch sketch = (ReservoirItemsSketch) obj.result(); + byte[] sketchBytes = sketch.toByteArray(sketchSerializer); + target.writeInt(sketchBytes.length); + target.write(sketchBytes); + } else { + throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); + } + } + + @Override + public DataStatistics deserialize(DataInputView source) throws IOException { + StatisticsType statisticsType = statisticsTypeSerializer.deserialize(source); + if (statisticsType == StatisticsType.Map) { + Map mapStatistics = mapSerializer.deserialize(source); + return new MapDataStatistics(mapStatistics); + } else if (statisticsType == StatisticsType.Sketch) { + int numBytes = source.readInt(); + byte[] sketchBytes = new byte[numBytes]; + source.read(sketchBytes); + Memory sketchMemory = Memory.wrap(sketchBytes); + ReservoirItemsSketch sketch = + ReservoirItemsSketch.heapify(sketchMemory, sketchSerializer); + return new SketchDataStatistics(sketch); + } else { + throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); + } + } + + @Override + public DataStatistics deserialize(DataStatistics reuse, DataInputView source) throws IOException { + // not much benefit to reuse + return deserialize(source); + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws IOException { + serialize(deserialize(source), target); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof DataStatisticsSerializer)) { + return false; + } + + DataStatisticsSerializer other = (DataStatisticsSerializer) obj; + return Objects.equals(sortKeySerializer, other.sortKeySerializer); + } + + @Override + public int hashCode() { + return sortKeySerializer.hashCode(); + } + + @Override + public TypeSerializerSnapshot snapshotConfiguration() { + return new DataStatisticsSerializerSnapshot(this); + } + + public static class DataStatisticsSerializerSnapshot + extends CompositeTypeSerializerSnapshot { + private static final int CURRENT_VERSION = 1; + + /** Constructor for read instantiation. 
*/ + @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) + public DataStatisticsSerializerSnapshot() {} + + @SuppressWarnings("checkstyle:RedundantModifier") + public DataStatisticsSerializerSnapshot(DataStatisticsSerializer serializer) { + super(serializer); + } + + @Override + protected int getCurrentOuterSnapshotVersion() { + return CURRENT_VERSION; + } + + @Override + protected TypeSerializer[] getNestedSerializers(DataStatisticsSerializer outerSerializer) { + return new TypeSerializer[] {outerSerializer.sortKeySerializer}; + } + + @Override + protected DataStatisticsSerializer createOuterSerializerWithNestedSerializers( + TypeSerializer[] nestedSerializers) { + SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; + return new DataStatisticsSerializer(sortKeySerializer); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java new file mode 100644 index 000000000000..50ec23e9f7a2 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * This is used by {@link RangePartitioner} for guiding range partitioning. This is what is sent to + * the operator subtasks. For sketch statistics, it only contains much smaller range bounds than the + * complete raw samples. 
+ */ +class GlobalStatistics { + private final long checkpointId; + private final StatisticsType type; + private final MapAssignment mapAssignment; + private final SortKey[] rangeBounds; + + private transient Integer hashCode; + + GlobalStatistics( + long checkpointId, StatisticsType type, MapAssignment mapAssignment, SortKey[] rangeBounds) { + Preconditions.checkArgument( + (mapAssignment != null && rangeBounds == null) + || (mapAssignment == null && rangeBounds != null), + "Invalid key assignment or range bounds: both are non-null or null"); + this.checkpointId = checkpointId; + this.type = type; + this.mapAssignment = mapAssignment; + this.rangeBounds = rangeBounds; + } + + static GlobalStatistics fromMapAssignment(long checkpointId, MapAssignment mapAssignment) { + return new GlobalStatistics(checkpointId, StatisticsType.Map, mapAssignment, null); + } + + static GlobalStatistics fromRangeBounds(long checkpointId, SortKey[] rangeBounds) { + return new GlobalStatistics(checkpointId, StatisticsType.Sketch, null, rangeBounds); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("checkpointId", checkpointId) + .add("type", type) + .add("mapAssignment", mapAssignment) + .add("rangeBounds", rangeBounds) + .toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (!(o instanceof GlobalStatistics)) { + return false; + } + + GlobalStatistics other = (GlobalStatistics) o; + return Objects.equal(checkpointId, other.checkpointId) + && Objects.equal(type, other.type) + && Objects.equal(mapAssignment, other.mapAssignment()) + && Arrays.equals(rangeBounds, other.rangeBounds()); + } + + @Override + public int hashCode() { + // implemented caching because coordinator can call the hashCode many times. + // when subtasks request statistics refresh upon initialization for reconciliation purpose, + // hashCode is used to check if there is any difference btw coordinator and operator state. + if (hashCode == null) { + this.hashCode = Objects.hashCode(checkpointId, type, mapAssignment, rangeBounds); + } + + return hashCode; + } + + long checkpointId() { + return checkpointId; + } + + StatisticsType type() { + return type; + } + + MapAssignment mapAssignment() { + return mapAssignment; + } + + SortKey[] rangeBounds() { + return rangeBounds; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java new file mode 100644 index 000000000000..a7fe2b30b865 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
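On the sketch path, those range bounds are all the RangePartitioner needs: a record's sort key is binary-searched against the bounds and the insertion point is the target subtask. An integer-based illustration of that routing is shown below; the real partitioner compares SortKey values with the sort-order comparator.

    import java.util.Arrays;

    // Illustrative range routing: with bounds b[0..n-2] for n subtasks, key k goes to the
    // first subtask whose bound is >= k; keys above the last bound go to the last subtask.
    final class RangeRouting {
      private RangeRouting() {}

      static int subtaskFor(int key, int[] sortedBounds) {
        int pos = Arrays.binarySearch(sortedBounds, key);
        // binarySearch returns (-(insertion point) - 1) when the key is not an exact bound
        return pos >= 0 ? pos : -pos - 1;
      }
    }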
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.base.EnumSerializer; +import org.apache.flink.api.common.typeutils.base.IntSerializer; +import org.apache.flink.api.common.typeutils.base.ListSerializer; +import org.apache.flink.api.common.typeutils.base.LongSerializer; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +class GlobalStatisticsSerializer extends TypeSerializer { + private final TypeSerializer sortKeySerializer; + private final EnumSerializer statisticsTypeSerializer; + private final ListSerializer rangeBoundsSerializer; + private final ListSerializer intsSerializer; + private final ListSerializer longsSerializer; + + GlobalStatisticsSerializer(TypeSerializer sortKeySerializer) { + this.sortKeySerializer = sortKeySerializer; + this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); + this.rangeBoundsSerializer = new ListSerializer<>(sortKeySerializer); + this.intsSerializer = new ListSerializer<>(IntSerializer.INSTANCE); + this.longsSerializer = new ListSerializer<>(LongSerializer.INSTANCE); + } + + @Override + public boolean isImmutableType() { + return false; + } + + @Override + public TypeSerializer duplicate() { + return new GlobalStatisticsSerializer(sortKeySerializer); + } + + @Override + public GlobalStatistics createInstance() { + return GlobalStatistics.fromRangeBounds(0L, new SortKey[0]); + } + + @Override + public GlobalStatistics copy(GlobalStatistics from) { + return new GlobalStatistics( + from.checkpointId(), from.type(), from.mapAssignment(), from.rangeBounds()); + } + + @Override + public GlobalStatistics copy(GlobalStatistics from, GlobalStatistics reuse) { + // no benefit of reuse + return copy(from); + } + + @Override + public int getLength() { + return -1; + } + + @Override + public void serialize(GlobalStatistics record, DataOutputView target) throws IOException { + target.writeLong(record.checkpointId()); + statisticsTypeSerializer.serialize(record.type(), target); + if (record.type() == StatisticsType.Map) { + MapAssignment mapAssignment = record.mapAssignment(); + target.writeInt(mapAssignment.numPartitions()); + target.writeInt(mapAssignment.keyAssignments().size()); + for (Map.Entry entry : mapAssignment.keyAssignments().entrySet()) { + sortKeySerializer.serialize(entry.getKey(), target); + KeyAssignment keyAssignment = entry.getValue(); + intsSerializer.serialize(keyAssignment.assignedSubtasks(), target); + longsSerializer.serialize(keyAssignment.subtaskWeightsWithCloseFileCost(), target); + target.writeLong(keyAssignment.closeFileCostWeight()); + } + } else { + rangeBoundsSerializer.serialize(Arrays.asList(record.rangeBounds()), target); + } + } + + @Override + public GlobalStatistics deserialize(DataInputView source) throws IOException { + long checkpointId = source.readLong(); + StatisticsType type = statisticsTypeSerializer.deserialize(source); + if (type == StatisticsType.Map) 
{ + int numPartitions = source.readInt(); + int mapSize = source.readInt(); + Map keyAssignments = Maps.newHashMapWithExpectedSize(mapSize); + for (int i = 0; i < mapSize; ++i) { + SortKey sortKey = sortKeySerializer.deserialize(source); + List assignedSubtasks = intsSerializer.deserialize(source); + List subtaskWeightsWithCloseFileCost = longsSerializer.deserialize(source); + long closeFileCostWeight = source.readLong(); + keyAssignments.put( + sortKey, + new KeyAssignment( + assignedSubtasks, subtaskWeightsWithCloseFileCost, closeFileCostWeight)); + } + + return GlobalStatistics.fromMapAssignment( + checkpointId, new MapAssignment(numPartitions, keyAssignments)); + } else { + List sortKeys = rangeBoundsSerializer.deserialize(source); + SortKey[] rangeBounds = new SortKey[sortKeys.size()]; + return GlobalStatistics.fromRangeBounds(checkpointId, sortKeys.toArray(rangeBounds)); + } + } + + @Override + public GlobalStatistics deserialize(GlobalStatistics reuse, DataInputView source) + throws IOException { + // not much benefit to reuse + return deserialize(source); + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws IOException { + serialize(deserialize(source), target); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + + if (obj == null || getClass() != obj.getClass()) { + return false; + } + + GlobalStatisticsSerializer other = (GlobalStatisticsSerializer) obj; + return Objects.equals(sortKeySerializer, other.sortKeySerializer); + } + + @Override + public int hashCode() { + return sortKeySerializer.hashCode(); + } + + @Override + public TypeSerializerSnapshot snapshotConfiguration() { + return new GlobalStatisticsSerializerSnapshot(this); + } + + public static class GlobalStatisticsSerializerSnapshot + extends CompositeTypeSerializerSnapshot { + private static final int CURRENT_VERSION = 1; + + /** Constructor for read instantiation. */ + @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) + public GlobalStatisticsSerializerSnapshot() {} + + @SuppressWarnings("checkstyle:RedundantModifier") + public GlobalStatisticsSerializerSnapshot(GlobalStatisticsSerializer serializer) { + super(serializer); + } + + @Override + protected int getCurrentOuterSnapshotVersion() { + return CURRENT_VERSION; + } + + @Override + protected TypeSerializer[] getNestedSerializers(GlobalStatisticsSerializer outerSerializer) { + return new TypeSerializer[] {outerSerializer.sortKeySerializer}; + } + + @Override + protected GlobalStatisticsSerializer createOuterSerializerWithNestedSerializers( + TypeSerializer[] nestedSerializers) { + SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; + return new GlobalStatisticsSerializer(sortKeySerializer); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java new file mode 100644 index 000000000000..781bcc646023 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
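A quick sanity check for a serializer like this is a write/read round trip. The sketch below assumes an Iceberg Schema named schema and a SortOrder named sortOrder are available to build the nested SortKeySerializer; it is not a test from the patch.

    import java.io.IOException;
    import org.apache.flink.core.memory.DataInputDeserializer;
    import org.apache.flink.core.memory.DataOutputSerializer;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.SortKey;
    import org.apache.iceberg.SortOrder;

    // Illustrative round-trip check for the global statistics serializer.
    final class GlobalStatisticsRoundTrip {
      private GlobalStatisticsRoundTrip() {}

      static boolean roundTripsCleanly(Schema schema, SortOrder sortOrder) throws IOException {
        GlobalStatisticsSerializer serializer =
            new GlobalStatisticsSerializer(new SortKeySerializer(schema, sortOrder));
        GlobalStatistics original = GlobalStatistics.fromRangeBounds(1L, new SortKey[0]);

        DataOutputSerializer out = new DataOutputSerializer(64);
        serializer.serialize(original, out);
        GlobalStatistics restored =
            serializer.deserialize(new DataInputDeserializer(out.getCopyOfBuffer()));

        return original.equals(restored);
      }
    }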
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.ThreadLocalRandom; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** Subtask assignment for a key for Map statistics based */ +class KeyAssignment { + private final List assignedSubtasks; + private final List subtaskWeightsWithCloseFileCost; + private final long closeFileCostWeight; + private final long[] subtaskWeightsExcludingCloseCost; + private final long keyWeight; + private final long[] cumulativeWeights; + + /** + * @param assignedSubtasks assigned subtasks for this key. It could be a single subtask. It could + * also be multiple subtasks if the key has heavy weight that should be handled by multiple + * subtasks. + * @param subtaskWeightsWithCloseFileCost assigned weight for each subtask. E.g., if the keyWeight + * is 27 and the key is assigned to 3 subtasks, subtaskWeights could contain values as [10, + * 10, 7] for target weight of 10 per subtask. + */ + KeyAssignment( + List assignedSubtasks, + List subtaskWeightsWithCloseFileCost, + long closeFileCostWeight) { + Preconditions.checkArgument( + assignedSubtasks != null && !assignedSubtasks.isEmpty(), + "Invalid assigned subtasks: null or empty"); + Preconditions.checkArgument( + subtaskWeightsWithCloseFileCost != null && !subtaskWeightsWithCloseFileCost.isEmpty(), + "Invalid assigned subtasks weights: null or empty"); + Preconditions.checkArgument( + assignedSubtasks.size() == subtaskWeightsWithCloseFileCost.size(), + "Invalid assignment: size mismatch (tasks length = %s, weights length = %s)", + assignedSubtasks.size(), + subtaskWeightsWithCloseFileCost.size()); + subtaskWeightsWithCloseFileCost.forEach( + weight -> + Preconditions.checkArgument( + weight > closeFileCostWeight, + "Invalid weight: should be larger than close file cost: weight = %s, close file cost = %s", + weight, + closeFileCostWeight)); + + this.assignedSubtasks = assignedSubtasks; + this.subtaskWeightsWithCloseFileCost = subtaskWeightsWithCloseFileCost; + this.closeFileCostWeight = closeFileCostWeight; + // Exclude the close file cost for key routing + this.subtaskWeightsExcludingCloseCost = + subtaskWeightsWithCloseFileCost.stream() + .mapToLong(weightWithCloseFileCost -> weightWithCloseFileCost - closeFileCostWeight) + .toArray(); + this.keyWeight = Arrays.stream(subtaskWeightsExcludingCloseCost).sum(); + this.cumulativeWeights = new long[subtaskWeightsExcludingCloseCost.length]; + long cumulativeWeight = 0; + for (int i = 0; i < subtaskWeightsExcludingCloseCost.length; ++i) { + cumulativeWeight += subtaskWeightsExcludingCloseCost[i]; + cumulativeWeights[i] = cumulativeWeight; + } + } + + List assignedSubtasks() { + return assignedSubtasks; + } + + List subtaskWeightsWithCloseFileCost() { + return 
subtaskWeightsWithCloseFileCost; + } + + long closeFileCostWeight() { + return closeFileCostWeight; + } + + long[] subtaskWeightsExcludingCloseCost() { + return subtaskWeightsExcludingCloseCost; + } + + /** + * Select a subtask for the key. + * + * @return subtask id + */ + int select() { + if (assignedSubtasks.size() == 1) { + // only choice. no need to run random number generator. + return assignedSubtasks.get(0); + } else { + long randomNumber = ThreadLocalRandom.current().nextLong(keyWeight); + int index = Arrays.binarySearch(cumulativeWeights, randomNumber); + // choose the subtask where randomNumber < cumulativeWeights[pos]. + // this works regardless whether index is negative or not. + int position = Math.abs(index + 1); + Preconditions.checkState( + position < assignedSubtasks.size(), + "Invalid selected position: out of range. key weight = %s, random number = %s, cumulative weights array = %s", + keyWeight, + randomNumber, + cumulativeWeights); + return assignedSubtasks.get(position); + } + } + + @Override + public int hashCode() { + return Objects.hash(assignedSubtasks, subtaskWeightsWithCloseFileCost, closeFileCostWeight); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + KeyAssignment that = (KeyAssignment) o; + return Objects.equals(assignedSubtasks, that.assignedSubtasks) + && Objects.equals(subtaskWeightsWithCloseFileCost, that.subtaskWeightsWithCloseFileCost) + && closeFileCostWeight == that.closeFileCostWeight; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("assignedSubtasks", assignedSubtasks) + .add("subtaskWeightsWithCloseFileCost", subtaskWeightsWithCloseFileCost) + .add("closeFileCostWeight", closeFileCostWeight) + .toString(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java new file mode 100644 index 000000000000..9d8167460a1b --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
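The select() routing above reduces to a weighted random pick: draw a number in [0, keyWeight) and binary-search it against the cumulative weight array. A minimal standalone sketch of that technique, using hypothetical subtask ids and weights rather than a real KeyAssignment:

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;

public class WeightedSubtaskPick {
  public static void main(String[] args) {
    // Hypothetical example: a hot key split across subtasks 3, 4, 5 with
    // routing weights 10, 10, 7 (close-file cost already excluded).
    List<Integer> assignedSubtasks = List.of(3, 4, 5);
    long[] weights = {10L, 10L, 7L};

    // Cumulative weights: [10, 20, 27]; the total key weight is 27.
    long[] cumulative = new long[weights.length];
    long running = 0;
    for (int i = 0; i < weights.length; i++) {
      running += weights[i];
      cumulative[i] = running;
    }

    // Draw in [0, 27) and find the first bucket whose cumulative weight exceeds the draw.
    // binarySearch returns -(insertionPoint) - 1 on a miss, so Math.abs(index + 1)
    // recovers the bucket position for both exact hits and misses.
    long draw = ThreadLocalRandom.current().nextLong(running);
    int index = Arrays.binarySearch(cumulative, draw);
    int position = Math.abs(index + 1);
    System.out.println("draw=" + draw + " -> subtask " + assignedSubtasks.get(position));
  }
}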
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.Pair; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Key assignment to subtasks for Map statistics. */ +class MapAssignment { + private static final Logger LOG = LoggerFactory.getLogger(MapAssignment.class); + + private final int numPartitions; + private final Map keyAssignments; + + MapAssignment(int numPartitions, Map keyAssignments) { + Preconditions.checkArgument(keyAssignments != null, "Invalid key assignments: null"); + this.numPartitions = numPartitions; + this.keyAssignments = keyAssignments; + } + + static MapAssignment fromKeyFrequency( + int numPartitions, + Map mapStatistics, + double closeFileCostWeightPercentage, + Comparator comparator) { + return new MapAssignment( + numPartitions, + assignment(numPartitions, mapStatistics, closeFileCostWeightPercentage, comparator)); + } + + @Override + public int hashCode() { + return Objects.hashCode(numPartitions, keyAssignments); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + MapAssignment that = (MapAssignment) o; + return numPartitions == that.numPartitions && keyAssignments.equals(that.keyAssignments); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("numPartitions", numPartitions) + .add("keyAssignments", keyAssignments) + .toString(); + } + + int numPartitions() { + return numPartitions; + } + + Map keyAssignments() { + return keyAssignments; + } + + /** + * Returns assignment summary for every subtask. + * + * @return assignment summary for every subtask. Key is subtaskId. 
Value pair is (weight assigned + * to the subtask, number of keys assigned to the subtask) + */ + Map> assignmentInfo() { + Map> assignmentInfo = Maps.newTreeMap(); + keyAssignments.forEach( + (key, keyAssignment) -> { + for (int i = 0; i < keyAssignment.assignedSubtasks().size(); ++i) { + int subtaskId = keyAssignment.assignedSubtasks().get(i); + long subtaskWeight = keyAssignment.subtaskWeightsExcludingCloseCost()[i]; + Pair oldValue = assignmentInfo.getOrDefault(subtaskId, Pair.of(0L, 0)); + assignmentInfo.put( + subtaskId, Pair.of(oldValue.first() + subtaskWeight, oldValue.second() + 1)); + } + }); + + return assignmentInfo; + } + + static Map assignment( + int numPartitions, + Map mapStatistics, + double closeFileCostWeightPercentage, + Comparator comparator) { + mapStatistics.forEach( + (key, value) -> + Preconditions.checkArgument( + value > 0, "Invalid statistics: weight is 0 for key %s", key)); + + long totalWeight = mapStatistics.values().stream().mapToLong(l -> l).sum(); + double targetWeightPerSubtask = ((double) totalWeight) / numPartitions; + long closeFileCostWeight = + (long) Math.ceil(targetWeightPerSubtask * closeFileCostWeightPercentage / 100); + + NavigableMap sortedStatsWithCloseFileCost = Maps.newTreeMap(comparator); + mapStatistics.forEach( + (k, v) -> { + int estimatedSplits = (int) Math.ceil(v / targetWeightPerSubtask); + long estimatedCloseFileCost = closeFileCostWeight * estimatedSplits; + sortedStatsWithCloseFileCost.put(k, v + estimatedCloseFileCost); + }); + + long totalWeightWithCloseFileCost = + sortedStatsWithCloseFileCost.values().stream().mapToLong(l -> l).sum(); + long targetWeightPerSubtaskWithCloseFileCost = + (long) Math.ceil(((double) totalWeightWithCloseFileCost) / numPartitions); + return buildAssignment( + numPartitions, + sortedStatsWithCloseFileCost, + targetWeightPerSubtaskWithCloseFileCost, + closeFileCostWeight); + } + + private static Map buildAssignment( + int numPartitions, + NavigableMap sortedStatistics, + long targetWeightPerSubtask, + long closeFileCostWeight) { + Map assignmentMap = + Maps.newHashMapWithExpectedSize(sortedStatistics.size()); + Iterator mapKeyIterator = sortedStatistics.keySet().iterator(); + int subtaskId = 0; + SortKey currentKey = null; + long keyRemainingWeight = 0L; + long subtaskRemainingWeight = targetWeightPerSubtask; + List assignedSubtasks = Lists.newArrayList(); + List subtaskWeights = Lists.newArrayList(); + while (mapKeyIterator.hasNext() || currentKey != null) { + // This should never happen because target weight is calculated using ceil function. + if (subtaskId >= numPartitions) { + LOG.error( + "Internal algorithm error: exhausted subtasks with unassigned keys left. 
number of partitions: {}, " + + "target weight per subtask: {}, close file cost in weight: {}, data statistics: {}", + numPartitions, + targetWeightPerSubtask, + closeFileCostWeight, + sortedStatistics); + throw new IllegalStateException( + "Internal algorithm error: exhausted subtasks with unassigned keys left"); + } + + if (currentKey == null) { + currentKey = mapKeyIterator.next(); + keyRemainingWeight = sortedStatistics.get(currentKey); + } + + assignedSubtasks.add(subtaskId); + if (keyRemainingWeight < subtaskRemainingWeight) { + // assign the remaining weight of the key to the current subtask + subtaskWeights.add(keyRemainingWeight); + subtaskRemainingWeight -= keyRemainingWeight; + keyRemainingWeight = 0L; + } else { + // filled up the current subtask + long assignedWeight = subtaskRemainingWeight; + keyRemainingWeight -= subtaskRemainingWeight; + + // If assigned weight is less than close file cost, pad it up with close file cost. + // This might cause the subtask assigned weight over the target weight. + // But it should be no more than one close file cost. Small skew is acceptable. + if (assignedWeight <= closeFileCostWeight) { + long paddingWeight = Math.min(keyRemainingWeight, closeFileCostWeight); + keyRemainingWeight -= paddingWeight; + assignedWeight += paddingWeight; + } + + subtaskWeights.add(assignedWeight); + // move on to the next subtask + subtaskId += 1; + subtaskRemainingWeight = targetWeightPerSubtask; + } + + Preconditions.checkState( + assignedSubtasks.size() == subtaskWeights.size(), + "List size mismatch: assigned subtasks = %s, subtask weights = %s", + assignedSubtasks, + subtaskWeights); + + // If the remaining key weight is smaller than the close file cost, simply skip the residual + // as it doesn't make sense to assign a weight smaller than close file cost to a new subtask. + // this might lead to some inaccuracy in weight calculation. E.g., assuming the key weight is + // 2 and close file cost is 2. key weight with close cost is 4. Let's assume the previous + // task has a weight of 3 available. So weight of 3 for this key is assigned to the task and + // the residual weight of 1 is dropped. Then the routing weight for this key is 1 (minus the + // close file cost), which is inaccurate as the true key weight should be 2. + // Again, this greedy algorithm is not intended to be perfect. Some small inaccuracy is + // expected and acceptable. Traffic distribution should still be balanced. + if (keyRemainingWeight > 0 && keyRemainingWeight <= closeFileCostWeight) { + keyRemainingWeight = 0; + } + + if (keyRemainingWeight == 0) { + // finishing up the assignment for the current key + KeyAssignment keyAssignment = + new KeyAssignment(assignedSubtasks, subtaskWeights, closeFileCostWeight); + assignmentMap.put(currentKey, keyAssignment); + assignedSubtasks = Lists.newArrayList(); + subtaskWeights = Lists.newArrayList(); + currentKey = null; + } + } + + return assignmentMap; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java new file mode 100644 index 000000000000..05b943f6046f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
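To make the greedy packing in buildAssignment concrete, here is a simplified standalone sketch of the same idea, with hypothetical key weights and the close-file cost omitted for brevity: keys are walked in sorted order and sliced across subtasks until each subtask reaches the target weight.

import java.util.Map;
import java.util.TreeMap;

public class GreedyRangePacking {
  public static void main(String[] args) {
    // Hypothetical sorted key weights and 3 subtasks: total weight = 30, target = 10.
    TreeMap<String, Long> weights = new TreeMap<>(Map.of("a", 14L, "b", 4L, "c", 7L, "d", 5L));
    int numPartitions = 3;
    long total = weights.values().stream().mapToLong(Long::longValue).sum();
    long target = (long) Math.ceil(total / (double) numPartitions);

    int subtask = 0;
    long subtaskRemaining = target;
    for (Map.Entry<String, Long> entry : weights.entrySet()) {
      long keyRemaining = entry.getValue();
      while (keyRemaining > 0) {
        long assigned = Math.min(keyRemaining, subtaskRemaining);
        System.out.printf("key %s -> subtask %d (weight %d)%n", entry.getKey(), subtask, assigned);
        keyRemaining -= assigned;
        subtaskRemaining -= assigned;
        if (subtaskRemaining == 0) { // current subtask is full, move on to the next one
          subtask++;
          subtaskRemaining = target;
        }
      }
    }
    // Expected slices: a -> 0 (10), a -> 1 (4), b -> 1 (4), c -> 1 (2), c -> 2 (5), d -> 2 (5)
  }
}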
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Map; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** MapDataStatistics uses map to count key frequency */ +class MapDataStatistics implements DataStatistics { + private final Map keyFrequency; + + MapDataStatistics() { + this.keyFrequency = Maps.newHashMap(); + } + + MapDataStatistics(Map keyFrequency) { + this.keyFrequency = keyFrequency; + } + + @Override + public StatisticsType type() { + return StatisticsType.Map; + } + + @Override + public boolean isEmpty() { + return keyFrequency.isEmpty(); + } + + @Override + public void add(SortKey sortKey) { + if (keyFrequency.containsKey(sortKey)) { + keyFrequency.merge(sortKey, 1L, Long::sum); + } else { + // clone the sort key before adding to map because input sortKey object can be reused + SortKey copiedKey = sortKey.copy(); + keyFrequency.put(copiedKey, 1L); + } + } + + @Override + public Object result() { + return keyFrequency; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this).add("map", keyFrequency).toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (!(o instanceof MapDataStatistics)) { + return false; + } + + MapDataStatistics other = (MapDataStatistics) o; + return Objects.equal(keyFrequency, other.keyFrequency); + } + + @Override + public int hashCode() { + return Objects.hashCode(keyFrequency); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java new file mode 100644 index 000000000000..f36a078c94e0 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.concurrent.TimeUnit; +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Internal partitioner implementation that supports MapDataStatistics, which is typically used for + * low-cardinality use cases. While MapDataStatistics can keep accurate counters, it can't be used + * for high-cardinality use cases. Otherwise, the memory footprint is too high. + * + *

It is a greedy algorithm for bin packing. With close file cost, the calculation isn't always + * precise when calculating close cost for every file, target weight per subtask, padding residual + * weight, assigned weight without close cost. + * + *

All actions should be executed in a single Flink mailbox thread. So there is no need to make + * it thread safe. + */ +class MapRangePartitioner implements Partitioner { + private static final Logger LOG = LoggerFactory.getLogger(MapRangePartitioner.class); + + private final RowDataWrapper rowDataWrapper; + private final SortKey sortKey; + private final MapAssignment mapAssignment; + + // Counter that tracks how many times a new key encountered + // where there is no traffic statistics learned about it. + private long newSortKeyCounter; + private long lastNewSortKeyLogTimeMilli; + + MapRangePartitioner(Schema schema, SortOrder sortOrder, MapAssignment mapAssignment) { + this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + this.sortKey = new SortKey(schema, sortOrder); + this.mapAssignment = mapAssignment; + this.newSortKeyCounter = 0; + this.lastNewSortKeyLogTimeMilli = System.currentTimeMillis(); + } + + @Override + public int partition(RowData row, int numPartitions) { + // reuse the sortKey and rowDataWrapper + sortKey.wrap(rowDataWrapper.wrap(row)); + KeyAssignment keyAssignment = mapAssignment.keyAssignments().get(sortKey); + + int partition; + if (keyAssignment == null) { + LOG.trace( + "Encountered new sort key: {}. Fall back to round robin as statistics not learned yet.", + sortKey); + // Ideally unknownKeyCounter should be published as a counter metric. + // It seems difficult to pass in MetricGroup into the partitioner. + // Just log an INFO message every minute. + newSortKeyCounter += 1; + long now = System.currentTimeMillis(); + if (now - lastNewSortKeyLogTimeMilli > TimeUnit.MINUTES.toMillis(1)) { + LOG.info( + "Encounter new sort keys {} times. Fall back to round robin as statistics not learned yet", + newSortKeyCounter); + lastNewSortKeyLogTimeMilli = now; + newSortKeyCounter = 0; + } + partition = (int) (newSortKeyCounter % numPartitions); + } else { + partition = keyAssignment.select(); + } + + return RangePartitioner.adjustPartitionWithRescale( + partition, mapAssignment.numPartitions(), numPartitions); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java new file mode 100644 index 000000000000..6608b938f5a8 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Random; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** This custom partitioner implements the {@link DistributionMode#RANGE} for Flink sink. */ +@Internal +public class RangePartitioner implements Partitioner { + private static final Logger LOG = LoggerFactory.getLogger(RangePartitioner.class); + + private final Schema schema; + private final SortOrder sortOrder; + + private transient AtomicLong roundRobinCounter; + private transient Partitioner delegatePartitioner; + + public RangePartitioner(Schema schema, SortOrder sortOrder) { + this.schema = schema; + this.sortOrder = sortOrder; + } + + @Override + public int partition(StatisticsOrRecord wrapper, int numPartitions) { + if (wrapper.hasStatistics()) { + this.delegatePartitioner = delegatePartitioner(wrapper.statistics()); + return (int) (roundRobinCounter(numPartitions).getAndIncrement() % numPartitions); + } else { + if (delegatePartitioner != null) { + return delegatePartitioner.partition(wrapper.record(), numPartitions); + } else { + int partition = (int) (roundRobinCounter(numPartitions).getAndIncrement() % numPartitions); + LOG.trace("Statistics not available. Round robin to partition {}", partition); + return partition; + } + } + } + + private AtomicLong roundRobinCounter(int numPartitions) { + if (roundRobinCounter == null) { + // randomize the starting point to avoid synchronization across subtasks + this.roundRobinCounter = new AtomicLong(new Random().nextInt(numPartitions)); + } + + return roundRobinCounter; + } + + private Partitioner delegatePartitioner(GlobalStatistics statistics) { + if (statistics.type() == StatisticsType.Map) { + return new MapRangePartitioner(schema, sortOrder, statistics.mapAssignment()); + } else if (statistics.type() == StatisticsType.Sketch) { + return new SketchRangePartitioner(schema, sortOrder, statistics.rangeBounds()); + } else { + throw new IllegalArgumentException( + String.format("Invalid statistics type: %s. Should be Map or Sketch", statistics.type())); + } + } + + /** + * Util method that handles rescale (write parallelism / numPartitions change). + * + * @param partition partition caculated based on the existing statistics + * @param numPartitionsStatsCalculation number of partitions when the assignment was calculated + * based on + * @param numPartitions current number of partitions + * @return adjusted partition if necessary. + */ + static int adjustPartitionWithRescale( + int partition, int numPartitionsStatsCalculation, int numPartitions) { + if (numPartitionsStatsCalculation <= numPartitions) { + // no rescale or scale-up case. + // new subtasks are ignored and not assigned any keys, which is sub-optimal and only + // transient. when rescale is detected, operator requests new statistics from + // coordinator upon initialization. + return partition; + } else { + // scale-down case. + // Use mod % operation to distribution the over-range partitions. + // It can cause skew among subtasks. but the behavior is still better than + // discarding the statistics and falling back to round-robin (no clustering). 
+ // Again, this is transient and stats refresh is requested when rescale is detected. + return partition % numPartitions; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java new file mode 100644 index 000000000000..ce94bec14860 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.runtime.operators.coordination.OperatorEvent; + +class RequestGlobalStatisticsEvent implements OperatorEvent { + private final Integer signature; + + RequestGlobalStatisticsEvent() { + this.signature = null; + } + + /** + * @param signature hashCode of the subtask's existing global statistics + */ + RequestGlobalStatisticsEvent(int signature) { + this.signature = signature; + } + + Integer signature() { + return signature; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java new file mode 100644 index 000000000000..efd87a883d78 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
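The scale-down branch of adjustPartitionWithRescale is a plain modulo fold of the partition computed from stale statistics into the smaller current parallelism, while scale-up passes the partition through. A tiny standalone sketch with hypothetical parallelism values:

public class RescaleAdjustment {
  // Mirrors the idea of adjustPartitionWithRescale: keep the partition on scale-up
  // (new subtasks simply receive no keys until statistics refresh), fold with modulo on scale-down.
  static int adjust(int partition, int partitionsAtStatsTime, int currentPartitions) {
    return partitionsAtStatsTime <= currentPartitions ? partition : partition % currentPartitions;
  }

  public static void main(String[] args) {
    // Statistics were computed for 8 subtasks; the job now runs with 4.
    for (int p = 0; p < 8; p++) {
      System.out.println("partition " + p + " -> " + adjust(p, 8, 4));
    }
    // Scale-up case: statistics for 4 subtasks, job now runs with 8; the partition passes through.
    System.out.println("partition 3 -> " + adjust(3, 4, 8));
  }
}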
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; + +/** SketchDataStatistics uses reservoir sampling algorithm to count key frequency */ +class SketchDataStatistics implements DataStatistics { + + private final ReservoirItemsSketch sketch; + + SketchDataStatistics(int reservoirSize) { + this.sketch = ReservoirItemsSketch.newInstance(reservoirSize); + } + + SketchDataStatistics(ReservoirItemsSketch sketchStats) { + this.sketch = sketchStats; + } + + @Override + public StatisticsType type() { + return StatisticsType.Sketch; + } + + @Override + public boolean isEmpty() { + return sketch.getNumSamples() == 0; + } + + @Override + public void add(SortKey sortKey) { + // clone the sort key first because input sortKey object can be reused + SortKey copiedKey = sortKey.copy(); + sketch.update(copiedKey); + } + + @Override + public Object result() { + return sketch; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this).add("sketch", sketch).toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (!(o instanceof SketchDataStatistics)) { + return false; + } + + ReservoirItemsSketch otherSketch = ((SketchDataStatistics) o).sketch; + return Objects.equal(sketch.getK(), otherSketch.getK()) + && Objects.equal(sketch.getN(), otherSketch.getN()) + && Arrays.deepEquals(sketch.getSamples(), otherSketch.getSamples()); + } + + @Override + public int hashCode() { + return Objects.hashCode(sketch.getK(), sketch.getN(), sketch.getSamples()); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java new file mode 100644 index 000000000000..dddb0d8722c0 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
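SketchDataStatistics delegates the heavy lifting to DataSketches' ReservoirItemsSketch. A small usage sketch of that class on plain strings, limited to the calls already used above (newInstance, update, getK, getN, getNumSamples, getSamples), to show what the operator ends up holding:

import org.apache.datasketches.sampling.ReservoirItemsSketch;

public class ReservoirSamplingSketch {
  public static void main(String[] args) {
    // A reservoir of at most 4 items holds a uniform random sample of
    // everything offered to it, regardless of how many items were seen.
    ReservoirItemsSketch<String> sketch = ReservoirItemsSketch.newInstance(4);
    for (int i = 0; i < 1_000; i++) {
      sketch.update("key-" + (i % 50));
    }

    System.out.println("k (reservoir size) = " + sketch.getK());
    System.out.println("n (items offered)  = " + sketch.getN());
    System.out.println("samples held       = " + sketch.getNumSamples());
    for (Object sample : sketch.getSamples()) {
      System.out.println("sample: " + sample);
    }
  }
}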
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Comparator; +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.types.Comparators; + +class SketchRangePartitioner implements Partitioner { + private final SortKey sortKey; + private final Comparator comparator; + private final SortKey[] rangeBounds; + private final RowDataWrapper rowDataWrapper; + + SketchRangePartitioner(Schema schema, SortOrder sortOrder, SortKey[] rangeBounds) { + this.sortKey = new SortKey(schema, sortOrder); + this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); + this.rangeBounds = rangeBounds; + this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + } + + @Override + public int partition(RowData row, int numPartitions) { + // reuse the sortKey and rowDataWrapper + sortKey.wrap(rowDataWrapper.wrap(row)); + return SketchUtil.partition(sortKey, numPartitions, rangeBounds, comparator); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java new file mode 100644 index 000000000000..3d572b98d53f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +class SketchUtil { + static final int COORDINATOR_MIN_RESERVOIR_SIZE = 10_000; + static final int COORDINATOR_MAX_RESERVOIR_SIZE = 1_000_000; + static final int COORDINATOR_TARGET_PARTITIONS_MULTIPLIER = 100; + static final int OPERATOR_OVER_SAMPLE_RATIO = 10; + + // switch the statistics tracking from map to sketch if the cardinality of the sort key is over + // this threshold. It is hardcoded for now, we can revisit in the future if config is needed. 
+ static final int OPERATOR_SKETCH_SWITCH_THRESHOLD = 10_000; + static final int COORDINATOR_SKETCH_SWITCH_THRESHOLD = 100_000; + + private SketchUtil() {} + + /** + * The larger the reservoir size, the more accurate for range bounds calculation and the more + * balanced range distribution. + * + *

Here are the heuristic rules + *

  • Target size: numPartitions x 100 to achieve good accuracy and to make it easier to calculate the + * range bounds + *
  • Min is 10K to achieve good accuracy while the memory footprint is still relatively small + *
  • Max is 1M to cap the memory footprint on coordinator + * + * @param numPartitions number of range partitions which equals to downstream operator parallelism + * @return reservoir size + */ + static int determineCoordinatorReservoirSize(int numPartitions) { + int reservoirSize = numPartitions * COORDINATOR_TARGET_PARTITIONS_MULTIPLIER; + + if (reservoirSize < COORDINATOR_MIN_RESERVOIR_SIZE) { + // adjust it up and still make reservoirSize divisible by numPartitions + int remainder = COORDINATOR_MIN_RESERVOIR_SIZE % numPartitions; + reservoirSize = COORDINATOR_MIN_RESERVOIR_SIZE + (numPartitions - remainder); + } else if (reservoirSize > COORDINATOR_MAX_RESERVOIR_SIZE) { + // adjust it down and still make reservoirSize divisible by numPartitions + int remainder = COORDINATOR_MAX_RESERVOIR_SIZE % numPartitions; + reservoirSize = COORDINATOR_MAX_RESERVOIR_SIZE - remainder; + } + + return reservoirSize; + } + + /** + * Determine the sampling reservoir size where operator subtasks collect data statistics. + * + *

    Here are the heuristic rules + *

  • Target size is "coordinator reservoir size * over sampling ratio (10) / operator + * parallelism" + *
  • Min is 1K to achieve good accuracy while the memory footprint is still relatively small + *
  • Max is 100K to cap the memory footprint on coordinator + * + * @param numPartitions number of range partitions which equals to downstream operator parallelism + * @param operatorParallelism data statistics operator parallelism + * @return reservoir size + */ + static int determineOperatorReservoirSize(int operatorParallelism, int numPartitions) { + int coordinatorReservoirSize = determineCoordinatorReservoirSize(numPartitions); + int totalOperatorSamples = coordinatorReservoirSize * OPERATOR_OVER_SAMPLE_RATIO; + return (int) Math.ceil((double) totalOperatorSamples / operatorParallelism); + } + + /** + * To understand how range bounds are used in range partitioning, here is an example for human + * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be + * + *
      + *
    • age <= 15 + *
    • age > 15 && age <= 32 + *
    • age > 32 && age <= 60 + *
    • age > 60 + *
    + * + *

    Assumption is that a single key is not dominant enough to span multiple subtasks. + * + * @param numPartitions number of partitions which maps to downstream operator parallelism + * @param samples sampled keys + * @return array of range partition bounds. It should be a sorted list (ascending). Number of + * items should be {@code numPartitions - 1}. if numPartitions is 1, return an empty list + */ + static SortKey[] rangeBounds( + int numPartitions, Comparator comparator, SortKey[] samples) { + // sort the keys first + Arrays.sort(samples, comparator); + int numCandidates = numPartitions - 1; + List candidatesList = Lists.newLinkedList(); + int step = (int) Math.ceil((double) samples.length / numPartitions); + int position = step - 1; + int numChosen = 0; + while (position < samples.length && numChosen < numCandidates) { + SortKey candidate = samples[position]; + // skip duplicate values + if (numChosen > 0 && candidate.equals(candidatesList.get(candidatesList.size() - 1))) { + // linear probe for the next distinct value + position += 1; + } else { + candidatesList.add(candidate); + position += step; + numChosen += 1; + } + } + SortKey[] candidates = candidatesList.toArray(new SortKey[0]); + return candidates; + } + + /** This can be a bit expensive since it is quadratic. */ + static void convertMapToSketch( + Map taskMapStats, Consumer sketchConsumer) { + taskMapStats.forEach( + (sortKey, count) -> { + for (int i = 0; i < count; ++i) { + sketchConsumer.accept(sortKey); + } + }); + } + + static int partition( + SortKey key, int numPartitions, SortKey[] rangeBounds, Comparator comparator) { + int partition = Arrays.binarySearch(rangeBounds, key, comparator); + + // binarySearch either returns the match location or -[insertion point]-1 + if (partition < 0) { + partition = -partition - 1; + } + + if (partition > rangeBounds.length) { + partition = rangeBounds.length; + } + + return RangePartitioner.adjustPartitionWithRescale( + partition, rangeBounds.length + 1, numPartitions); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java new file mode 100644 index 000000000000..6f5bb6722771 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java @@ -0,0 +1,406 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
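Tying the ages example back to the partition() helper above: routing is a binary search of the key against the sorted range bounds, with a negative result converted to the insertion point and the result capped at the number of bounds. A standalone sketch specialized to plain int ages and the bounds [15, 32, 60] from the javadoc:

import java.util.Arrays;

public class RangeBoundsRouting {
  // Same idea as SketchUtil.partition, specialized to int keys for illustration.
  static int partition(int key, int[] rangeBounds) {
    int partition = Arrays.binarySearch(rangeBounds, key);
    if (partition < 0) {
      partition = -partition - 1; // insertion point for keys falling between bounds
    }
    return Math.min(partition, rangeBounds.length);
  }

  public static void main(String[] args) {
    int[] bounds = {15, 32, 60}; // 4 partitions, as in the javadoc example
    int[] ages = {3, 15, 16, 32, 45, 60, 99};
    for (int age : ages) {
      System.out.println("age " + age + " -> partition " + partition(age, bounds));
    }
    // Expected: 3 -> 0, 15 -> 0, 16 -> 1, 32 -> 1, 45 -> 2, 60 -> 2, 99 -> 3
  }
}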
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Locale; +import java.util.Objects; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.flink.util.StringUtils; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.SortField; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.SortOrderParser; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.CheckCompatibility; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; + +class SortKeySerializer extends TypeSerializer { + private final Schema schema; + private final SortOrder sortOrder; + private final int size; + private final Types.NestedField[] transformedFields; + + private int version; + + private transient SortKey sortKey; + + SortKeySerializer(Schema schema, SortOrder sortOrder, int version) { + this.version = version; + this.schema = schema; + this.sortOrder = sortOrder; + this.size = sortOrder.fields().size(); + + this.transformedFields = new Types.NestedField[size]; + for (int i = 0; i < size; ++i) { + SortField sortField = sortOrder.fields().get(i); + Types.NestedField sourceField = schema.findField(sortField.sourceId()); + Type resultType = sortField.transform().getResultType(sourceField.type()); + Types.NestedField transformedField = + Types.NestedField.from(sourceField).ofType(resultType).build(); + transformedFields[i] = transformedField; + } + } + + SortKeySerializer(Schema schema, SortOrder sortOrder) { + this(schema, sortOrder, SortKeySerializerSnapshot.CURRENT_VERSION); + } + + private SortKey lazySortKey() { + if (sortKey == null) { + this.sortKey = new SortKey(schema, sortOrder); + } + + return sortKey; + } + + public int getLatestVersion() { + return snapshotConfiguration().getCurrentVersion(); + } + + public void restoreToLatestVersion() { + this.version = snapshotConfiguration().getCurrentVersion(); + } + + public void setVersion(int version) { + this.version = version; + } + + @Override + public boolean isImmutableType() { + return false; + } + + @Override + public TypeSerializer duplicate() { + return new SortKeySerializer(schema, sortOrder); + } + + @Override + public SortKey createInstance() { + return new SortKey(schema, sortOrder); + } + + @Override + public SortKey copy(SortKey from) { + return from.copy(); + } + + @Override + public SortKey copy(SortKey from, SortKey reuse) { + // no benefit of reuse + return copy(from); + } + + @Override + public int getLength() { + return -1; + } + + @Override + public void serialize(SortKey record, DataOutputView target) throws IOException { + Preconditions.checkArgument( + record.size() == size, + "Invalid size of the sort key object: %s. 
Expected %s", + record.size(), + size); + for (int i = 0; i < size; ++i) { + int fieldId = transformedFields[i].fieldId(); + Type.TypeID typeId = transformedFields[i].type().typeId(); + if (version > 1) { + Object value = record.get(i, Object.class); + if (value == null) { + target.writeBoolean(true); + continue; + } else { + target.writeBoolean(false); + } + } + + switch (typeId) { + case BOOLEAN: + target.writeBoolean(record.get(i, Boolean.class)); + break; + case INTEGER: + case DATE: + target.writeInt(record.get(i, Integer.class)); + break; + case LONG: + case TIME: + case TIMESTAMP: + target.writeLong(record.get(i, Long.class)); + break; + case FLOAT: + target.writeFloat(record.get(i, Float.class)); + break; + case DOUBLE: + target.writeDouble(record.get(i, Double.class)); + break; + case STRING: + target.writeUTF(record.get(i, CharSequence.class).toString()); + break; + case UUID: + UUID uuid = record.get(i, UUID.class); + target.writeLong(uuid.getMostSignificantBits()); + target.writeLong(uuid.getLeastSignificantBits()); + break; + case FIXED: + case BINARY: + byte[] bytes = record.get(i, ByteBuffer.class).array(); + target.writeInt(bytes.length); + target.write(bytes); + break; + case DECIMAL: + BigDecimal decimal = record.get(i, BigDecimal.class); + byte[] decimalBytes = decimal.unscaledValue().toByteArray(); + target.writeInt(decimalBytes.length); + target.write(decimalBytes); + target.writeInt(decimal.scale()); + break; + case STRUCT: + case MAP: + case LIST: + default: + // SortKey transformation is a flattened struct without list and map + throw new UnsupportedOperationException( + String.format( + Locale.ROOT, "Field %d has unsupported field type: %s", fieldId, typeId)); + } + } + } + + @Override + public SortKey deserialize(DataInputView source) throws IOException { + // copying is a little faster than constructing a new SortKey object + SortKey deserialized = lazySortKey().copy(); + deserialize(deserialized, source); + return deserialized; + } + + @Override + public SortKey deserialize(SortKey reuse, DataInputView source) throws IOException { + Preconditions.checkArgument( + reuse.size() == size, + "Invalid size of the sort key object: %s. 
Expected %s", + reuse.size(), + size); + for (int i = 0; i < size; ++i) { + if (version > 1) { + boolean isNull = source.readBoolean(); + if (isNull) { + reuse.set(i, null); + continue; + } + } + + int fieldId = transformedFields[i].fieldId(); + Type.TypeID typeId = transformedFields[i].type().typeId(); + switch (typeId) { + case BOOLEAN: + reuse.set(i, source.readBoolean()); + break; + case INTEGER: + case DATE: + reuse.set(i, source.readInt()); + break; + case LONG: + case TIME: + case TIMESTAMP: + reuse.set(i, source.readLong()); + break; + case FLOAT: + reuse.set(i, source.readFloat()); + break; + case DOUBLE: + reuse.set(i, source.readDouble()); + break; + case STRING: + reuse.set(i, source.readUTF()); + break; + case UUID: + long mostSignificantBits = source.readLong(); + long leastSignificantBits = source.readLong(); + reuse.set(i, new UUID(mostSignificantBits, leastSignificantBits)); + break; + case FIXED: + case BINARY: + byte[] bytes = new byte[source.readInt()]; + source.read(bytes); + reuse.set(i, ByteBuffer.wrap(bytes)); + break; + case DECIMAL: + byte[] unscaledBytes = new byte[source.readInt()]; + source.read(unscaledBytes); + int scale = source.readInt(); + BigDecimal decimal = new BigDecimal(new BigInteger(unscaledBytes), scale); + reuse.set(i, decimal); + break; + case STRUCT: + case MAP: + case LIST: + default: + // SortKey transformation is a flattened struct without list and map + throw new UnsupportedOperationException( + String.format( + Locale.ROOT, "Field %d has unsupported field type: %s", fieldId, typeId)); + } + } + + return reuse; + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws IOException { + // no optimization here + serialize(deserialize(source), target); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof SortKeySerializer)) { + return false; + } + + SortKeySerializer other = (SortKeySerializer) obj; + return Objects.equals(schema.asStruct(), other.schema.asStruct()) + && Objects.equals(sortOrder, other.sortOrder); + } + + @Override + public int hashCode() { + return schema.asStruct().hashCode() * 31 + sortOrder.hashCode(); + } + + @Override + public TypeSerializerSnapshot snapshotConfiguration() { + return new SortKeySerializerSnapshot(schema, sortOrder); + } + + public static class SortKeySerializerSnapshot implements TypeSerializerSnapshot { + private static final int CURRENT_VERSION = 2; + + private Schema schema; + private SortOrder sortOrder; + + private int version = CURRENT_VERSION; + + /** Constructor for read instantiation. */ + @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) + public SortKeySerializerSnapshot() { + // this constructor is used when restoring from a checkpoint. 
+ } + + @SuppressWarnings("checkstyle:RedundantModifier") + public SortKeySerializerSnapshot(Schema schema, SortOrder sortOrder) { + this.schema = schema; + this.sortOrder = sortOrder; + } + + @Override + public int getCurrentVersion() { + return CURRENT_VERSION; + } + + @Override + public void writeSnapshot(DataOutputView out) throws IOException { + Preconditions.checkState(schema != null, "Invalid schema: null"); + Preconditions.checkState(sortOrder != null, "Invalid sort order: null"); + + StringUtils.writeString(SchemaParser.toJson(schema), out); + StringUtils.writeString(SortOrderParser.toJson(sortOrder), out); + } + + @Override + public void readSnapshot(int readVersion, DataInputView in, ClassLoader userCodeClassLoader) + throws IOException { + switch (readVersion) { + case 1: + read(in); + this.version = 1; + break; + case 2: + read(in); + break; + default: + throw new IllegalArgumentException("Unknown read version: " + readVersion); + } + } + + @Override + public TypeSerializerSchemaCompatibility resolveSchemaCompatibility( + TypeSerializerSnapshot oldSerializerSnapshot) { + if (!(oldSerializerSnapshot instanceof SortKeySerializerSnapshot)) { + return TypeSerializerSchemaCompatibility.incompatible(); + } + + if (oldSerializerSnapshot.getCurrentVersion() == 1 && this.getCurrentVersion() == 2) { + return TypeSerializerSchemaCompatibility.compatibleAfterMigration(); + } + + // Sort order should be identical + SortKeySerializerSnapshot oldSnapshot = (SortKeySerializerSnapshot) oldSerializerSnapshot; + if (!sortOrder.sameOrder(oldSnapshot.sortOrder)) { + return TypeSerializerSchemaCompatibility.incompatible(); + } + + Set sortFieldIds = + sortOrder.fields().stream().map(SortField::sourceId).collect(Collectors.toSet()); + // only care about the schema related to sort fields + Schema sortSchema = TypeUtil.project(schema, sortFieldIds); + Schema oldSortSchema = TypeUtil.project(oldSnapshot.schema, sortFieldIds); + + List compatibilityErrors = + CheckCompatibility.writeCompatibilityErrors(sortSchema, oldSortSchema); + if (compatibilityErrors.isEmpty()) { + return TypeSerializerSchemaCompatibility.compatibleAsIs(); + } + + return TypeSerializerSchemaCompatibility.incompatible(); + } + + @Override + public TypeSerializer restoreSerializer() { + Preconditions.checkState(schema != null, "Invalid schema: null"); + Preconditions.checkState(sortOrder != null, "Invalid sort order: null"); + return new SortKeySerializer(schema, sortOrder, version); + } + + private void read(DataInputView in) throws IOException { + String schemaJson = StringUtils.readString(in); + String sortOrderJson = StringUtils.readString(in); + this.schema = SchemaParser.fromJson(schemaJson); + this.sortOrder = SortOrderParser.fromJson(sortOrderJson).bind(schema); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java new file mode 100644 index 000000000000..d6c23f035015 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
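For intuition on the version 2 wire format of SortKeySerializer, here is a rough standalone sketch of the per-field null marker, framed with Flink's DataOutputSerializer and DataInputDeserializer and plain integers instead of SortKey fields; getCopyOfBuffer is assumed here purely for the round trip.

import java.io.IOException;
import org.apache.flink.core.memory.DataInputDeserializer;
import org.apache.flink.core.memory.DataOutputSerializer;

public class NullMarkerFramingSketch {
  public static void main(String[] args) throws IOException {
    // Version 2 writes a boolean null marker before every field value;
    // version 1 wrote raw values only. Two INTEGER sort fields, the second one null.
    DataOutputSerializer out = new DataOutputSerializer(32);
    out.writeBoolean(false); // field 0 is present
    out.writeInt(42);        // field 0 value
    out.writeBoolean(true);  // field 1 is null, no value follows

    DataInputDeserializer in = new DataInputDeserializer(out.getCopyOfBuffer());
    Integer first = in.readBoolean() ? null : in.readInt();
    Integer second = in.readBoolean() ? null : in.readInt();
    System.out.println("field 0 = " + first + ", field 1 = " + second);
  }
}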
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.io.Serializable; +import java.io.UncheckedIOException; +import java.util.Arrays; +import java.util.List; +import org.apache.datasketches.common.ArrayOfItemsSerDe; +import org.apache.datasketches.common.ArrayOfStringsSerDe; +import org.apache.datasketches.common.ByteArrayUtil; +import org.apache.datasketches.common.Util; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.base.ListSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Only way to implement {@link ReservoirItemsSketch} serializer is to extend from {@link + * ArrayOfItemsSerDe}, as deserialization uses a private constructor from ReservoirItemsSketch. The + * implementation is modeled after {@link ArrayOfStringsSerDe} + */ +class SortKeySketchSerializer extends ArrayOfItemsSerDe implements Serializable { + private static final int DEFAULT_SORT_KEY_SIZE = 128; + + private final TypeSerializer itemSerializer; + private final ListSerializer listSerializer; + private final DataInputDeserializer input; + + SortKeySketchSerializer(TypeSerializer itemSerializer) { + this.itemSerializer = itemSerializer; + this.listSerializer = new ListSerializer<>(itemSerializer); + this.input = new DataInputDeserializer(); + } + + @Override + public byte[] serializeToByteArray(SortKey item) { + try { + DataOutputSerializer output = new DataOutputSerializer(DEFAULT_SORT_KEY_SIZE); + itemSerializer.serialize(item, output); + byte[] itemBytes = output.getSharedBuffer(); + int numBytes = output.length(); + byte[] out = new byte[numBytes + Integer.BYTES]; + ByteArrayUtil.copyBytes(itemBytes, 0, out, 4, numBytes); + ByteArrayUtil.putIntLE(out, 0, numBytes); + return out; + } catch (IOException e) { + throw new UncheckedIOException("Failed to serialize sort key", e); + } + } + + @Override + public byte[] serializeToByteArray(SortKey[] items) { + try { + DataOutputSerializer output = new DataOutputSerializer(DEFAULT_SORT_KEY_SIZE * items.length); + listSerializer.serialize(Arrays.asList(items), output); + byte[] itemsBytes = output.getSharedBuffer(); + int numBytes = output.length(); + byte[] out = new byte[Integer.BYTES + numBytes]; + ByteArrayUtil.putIntLE(out, 0, numBytes); + System.arraycopy(itemsBytes, 0, out, Integer.BYTES, numBytes); + return out; + } catch (IOException e) { + throw new UncheckedIOException("Failed to serialize sort key", e); + } + } + + @Override + public SortKey[] deserializeFromMemory(Memory mem, long startingOffset, int numItems) { + Preconditions.checkArgument(mem != null, "Invalid input memory: 
null"); + if (numItems <= 0) { + return new SortKey[0]; + } + + long offset = startingOffset; + Util.checkBounds(offset, Integer.BYTES, mem.getCapacity()); + int numBytes = mem.getInt(offset); + offset += Integer.BYTES; + + Util.checkBounds(offset, numBytes, mem.getCapacity()); + byte[] sortKeyBytes = new byte[numBytes]; + mem.getByteArray(offset, sortKeyBytes, 0, numBytes); + input.setBuffer(sortKeyBytes); + + try { + List sortKeys = listSerializer.deserialize(input); + SortKey[] array = new SortKey[numItems]; + sortKeys.toArray(array); + input.releaseArrays(); + return array; + } catch (IOException e) { + throw new UncheckedIOException("Failed to deserialize sort key sketch", e); + } + } + + @Override + public int sizeOf(SortKey item) { + return serializeToByteArray(item).length; + } + + @Override + public int sizeOf(Memory mem, long offset, int numItems) { + Preconditions.checkArgument(mem != null, "Invalid input memory: null"); + if (numItems <= 0) { + return 0; + } + + Util.checkBounds(offset, Integer.BYTES, mem.getCapacity()); + int numBytes = mem.getInt(offset); + return Integer.BYTES + numBytes; + } + + @Override + public String toString(SortKey item) { + return item.toString(); + } + + @Override + public Class getClassOfT() { + return SortKey.class; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java new file mode 100644 index 000000000000..891f2f02a241 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.List; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortField; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +class SortKeyUtil { + private SortKeyUtil() {} + + /** Compute the result schema of {@code SortKey} transformation */ + static Schema sortKeySchema(Schema schema, SortOrder sortOrder) { + List sortFields = sortOrder.fields(); + int size = sortFields.size(); + List transformedFields = Lists.newArrayListWithCapacity(size); + for (int i = 0; i < size; ++i) { + int sourceFieldId = sortFields.get(i).sourceId(); + Types.NestedField sourceField = schema.findField(sourceFieldId); + Preconditions.checkArgument( + sourceField != null, "Cannot find source field: %s", sourceFieldId); + Type transformedType = sortFields.get(i).transform().getResultType(sourceField.type()); + // There could be multiple transformations on the same source column, like in the PartitionKey + // case. To resolve the collision, field id is set to transform index and field name is set to + // sourceFieldName_transformIndex + Types.NestedField transformedField = + Types.NestedField.from(sourceField) + .withId(i) + .withName(sourceField.name() + '_' + i) + .ofType(transformedType) + .build(); + transformedFields.add(transformedField); + } + + return new Schema(transformedFields); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java new file mode 100644 index 000000000000..f6fcdb8b16ef --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
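A hypothetical usage sketch of SortKeyUtil.sortKeySchema, assuming same-package access since the class is package-private; it shows the renamed sourceName_index fields produced for an identity sort on two columns (a transform such as bucket would additionally change the result type).

package org.apache.iceberg.flink.sink.shuffle;

import org.apache.iceberg.Schema;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.types.Types;

public class SortKeySchemaExample {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "name", Types.StringType.get()));

    // Identity sort on name, then id.
    SortOrder sortOrder = SortOrder.builderFor(schema).asc("name").asc("id").build();

    // Expected result fields: name_0 (string) and id_1 (long), with field ids 0 and 1,
    // so repeated transforms over the same source column cannot collide.
    System.out.println(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct());
  }
}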
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; + +/** + * DataStatisticsEvent is sent between data statistics coordinator and operator to transmit data + * statistics in bytes + */ +@Internal +class StatisticsEvent implements OperatorEvent { + + private static final long serialVersionUID = 1L; + private final long checkpointId; + private final byte[] statisticsBytes; + private final boolean applyImmediately; + + private StatisticsEvent(long checkpointId, byte[] statisticsBytes, boolean applyImmediately) { + this.checkpointId = checkpointId; + this.statisticsBytes = statisticsBytes; + this.applyImmediately = applyImmediately; + } + + static StatisticsEvent createTaskStatisticsEvent( + long checkpointId, + DataStatistics statistics, + TypeSerializer statisticsSerializer) { + // applyImmediately is really only relevant for coordinator to operator event. + // task reported statistics is always merged immediately by the coordinator. + return new StatisticsEvent( + checkpointId, + StatisticsUtil.serializeDataStatistics(statistics, statisticsSerializer), + true); + } + + static StatisticsEvent createGlobalStatisticsEvent( + GlobalStatistics statistics, + TypeSerializer statisticsSerializer, + boolean applyImmediately) { + return new StatisticsEvent( + statistics.checkpointId(), + StatisticsUtil.serializeGlobalStatistics(statistics, statisticsSerializer), + applyImmediately); + } + + long checkpointId() { + return checkpointId; + } + + byte[] statisticsBytes() { + return statisticsBytes; + } + + boolean applyImmediately() { + return applyImmediately; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java new file mode 100644 index 000000000000..bc28df2b0e22 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.Serializable; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * The wrapper class for data statistics and record. 
It is the only way for the data statistics operator + * to send global data statistics to the custom partitioner to distribute data based on statistics. + * + *

    DataStatisticsOrRecord contains either data statistics(globally aggregated) or a record. It is + * sent from {@link DataStatisticsOperator} to partitioner. Once partitioner receives the data + * statistics, it will use that to decide the coming record should send to which writer subtask. + * After shuffling, a filter and mapper are required to filter out the data distribution weight, + * unwrap the object and extract the original record type T. + */ +@Internal +public class StatisticsOrRecord implements Serializable { + + private static final long serialVersionUID = 1L; + + private GlobalStatistics statistics; + private RowData record; + + private StatisticsOrRecord(GlobalStatistics statistics, RowData record) { + Preconditions.checkArgument( + record != null ^ statistics != null, "DataStatistics or record, not neither or both"); + this.statistics = statistics; + this.record = record; + } + + static StatisticsOrRecord fromRecord(RowData record) { + return new StatisticsOrRecord(null, record); + } + + static StatisticsOrRecord fromStatistics(GlobalStatistics statistics) { + return new StatisticsOrRecord(statistics, null); + } + + static StatisticsOrRecord reuseRecord( + StatisticsOrRecord reuse, TypeSerializer recordSerializer) { + if (reuse.hasRecord()) { + return reuse; + } else { + // not reusable + return StatisticsOrRecord.fromRecord(recordSerializer.createInstance()); + } + } + + static StatisticsOrRecord reuseStatistics( + StatisticsOrRecord reuse, TypeSerializer statisticsSerializer) { + if (reuse.hasStatistics()) { + return reuse; + } else { + // not reusable + return StatisticsOrRecord.fromStatistics(statisticsSerializer.createInstance()); + } + } + + boolean hasStatistics() { + return statistics != null; + } + + public boolean hasRecord() { + return record != null; + } + + GlobalStatistics statistics() { + return statistics; + } + + void statistics(GlobalStatistics newStatistics) { + this.statistics = newStatistics; + } + + public RowData record() { + return record; + } + + void record(RowData newRecord) { + this.record = newRecord; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("statistics", statistics) + .add("record", record) + .toString(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java new file mode 100644 index 000000000000..d4ae2b359679 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
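The filter-and-map step mentioned in the StatisticsOrRecord Javadoc above can be sketched as follows (the shuffled input stream is assumed to come from the range partitioner; illustrative only, not part of the patch):

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.sink.shuffle.StatisticsOrRecord;

class UnwrapRecordsExample {
  // Drops the globally aggregated statistics elements and extracts the wrapped RowData records.
  static DataStream<RowData> unwrapRecords(DataStream<StatisticsOrRecord> shuffled) {
    return shuffled
        .filter(StatisticsOrRecord::hasRecord)
        .map(StatisticsOrRecord::record);
  }
}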
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.util.Objects; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.flink.table.data.RowData; + +@Internal +class StatisticsOrRecordSerializer extends TypeSerializer { + private final TypeSerializer statisticsSerializer; + private final TypeSerializer recordSerializer; + + StatisticsOrRecordSerializer( + TypeSerializer statisticsSerializer, + TypeSerializer recordSerializer) { + this.statisticsSerializer = statisticsSerializer; + this.recordSerializer = recordSerializer; + } + + @Override + public boolean isImmutableType() { + return false; + } + + @SuppressWarnings("ReferenceEquality") + @Override + public TypeSerializer duplicate() { + TypeSerializer duplicateStatisticsSerializer = + statisticsSerializer.duplicate(); + TypeSerializer duplicateRowDataSerializer = recordSerializer.duplicate(); + if ((statisticsSerializer != duplicateStatisticsSerializer) + || (recordSerializer != duplicateRowDataSerializer)) { + return new StatisticsOrRecordSerializer( + duplicateStatisticsSerializer, duplicateRowDataSerializer); + } else { + return this; + } + } + + @Override + public StatisticsOrRecord createInstance() { + // arbitrarily always create RowData value instance + return StatisticsOrRecord.fromRecord(recordSerializer.createInstance()); + } + + @Override + public StatisticsOrRecord copy(StatisticsOrRecord from) { + if (from.hasRecord()) { + return StatisticsOrRecord.fromRecord(recordSerializer.copy(from.record())); + } else { + return StatisticsOrRecord.fromStatistics(statisticsSerializer.copy(from.statistics())); + } + } + + @Override + public StatisticsOrRecord copy(StatisticsOrRecord from, StatisticsOrRecord reuse) { + StatisticsOrRecord to; + if (from.hasRecord()) { + to = StatisticsOrRecord.reuseRecord(reuse, recordSerializer); + RowData record = recordSerializer.copy(from.record(), to.record()); + to.record(record); + } else { + to = StatisticsOrRecord.reuseStatistics(reuse, statisticsSerializer); + GlobalStatistics statistics = statisticsSerializer.copy(from.statistics(), to.statistics()); + to.statistics(statistics); + } + + return to; + } + + @Override + public int getLength() { + return -1; + } + + @Override + public void serialize(StatisticsOrRecord statisticsOrRecord, DataOutputView target) + throws IOException { + if (statisticsOrRecord.hasRecord()) { + target.writeBoolean(true); + recordSerializer.serialize(statisticsOrRecord.record(), target); + } else { + target.writeBoolean(false); + statisticsSerializer.serialize(statisticsOrRecord.statistics(), target); + } + } + + @Override + public StatisticsOrRecord deserialize(DataInputView source) throws IOException { + boolean isRecord = source.readBoolean(); + if (isRecord) { + return StatisticsOrRecord.fromRecord(recordSerializer.deserialize(source)); + } else { + return StatisticsOrRecord.fromStatistics(statisticsSerializer.deserialize(source)); + } + } + + @Override + public StatisticsOrRecord deserialize(StatisticsOrRecord reuse, DataInputView source) + throws IOException { + StatisticsOrRecord to; + boolean isRecord = source.readBoolean(); + if (isRecord) { + to = StatisticsOrRecord.reuseRecord(reuse, 
recordSerializer); + RowData record = recordSerializer.deserialize(to.record(), source); + to.record(record); + } else { + to = StatisticsOrRecord.reuseStatistics(reuse, statisticsSerializer); + GlobalStatistics statistics = statisticsSerializer.deserialize(to.statistics(), source); + to.statistics(statistics); + } + + return to; + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws IOException { + boolean hasRecord = source.readBoolean(); + target.writeBoolean(hasRecord); + if (hasRecord) { + recordSerializer.copy(source, target); + } else { + statisticsSerializer.copy(source, target); + } + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof StatisticsOrRecordSerializer)) { + return false; + } + + StatisticsOrRecordSerializer other = (StatisticsOrRecordSerializer) obj; + return Objects.equals(statisticsSerializer, other.statisticsSerializer) + && Objects.equals(recordSerializer, other.recordSerializer); + } + + @Override + public int hashCode() { + return Objects.hash(statisticsSerializer, recordSerializer); + } + + @Override + public TypeSerializerSnapshot snapshotConfiguration() { + return new StatisticsOrRecordSerializerSnapshot(this); + } + + public static class StatisticsOrRecordSerializerSnapshot + extends CompositeTypeSerializerSnapshot { + private static final int CURRENT_VERSION = 1; + + /** Constructor for read instantiation. */ + @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) + public StatisticsOrRecordSerializerSnapshot() {} + + @SuppressWarnings("checkstyle:RedundantModifier") + public StatisticsOrRecordSerializerSnapshot(StatisticsOrRecordSerializer serializer) { + super(serializer); + } + + @SuppressWarnings("checkstyle:RedundantModifier") + @Override + protected int getCurrentOuterSnapshotVersion() { + return CURRENT_VERSION; + } + + @Override + protected TypeSerializer[] getNestedSerializers( + StatisticsOrRecordSerializer outerSerializer) { + return new TypeSerializer[] { + outerSerializer.statisticsSerializer, outerSerializer.recordSerializer + }; + } + + @SuppressWarnings("unchecked") + @Override + protected StatisticsOrRecordSerializer createOuterSerializerWithNestedSerializers( + TypeSerializer[] nestedSerializers) { + TypeSerializer statisticsSerializer = + (TypeSerializer) nestedSerializers[0]; + TypeSerializer recordSerializer = (TypeSerializer) nestedSerializers[1]; + return new StatisticsOrRecordSerializer(statisticsSerializer, recordSerializer); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java new file mode 100644 index 000000000000..aa3af2bd2e38 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Objects; +import org.apache.flink.api.common.serialization.SerializerConfig; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; + +public class StatisticsOrRecordTypeInformation extends TypeInformation { + + private final TypeInformation rowTypeInformation; + private final SortOrder sortOrder; + private final GlobalStatisticsSerializer globalStatisticsSerializer; + + public StatisticsOrRecordTypeInformation( + RowType flinkRowType, Schema schema, SortOrder sortOrder) { + this.sortOrder = sortOrder; + this.rowTypeInformation = FlinkCompatibilityUtil.toTypeInfo(flinkRowType); + this.globalStatisticsSerializer = + new GlobalStatisticsSerializer(new SortKeySerializer(schema, sortOrder)); + } + + @Override + public boolean isBasicType() { + return false; + } + + @Override + public boolean isTupleType() { + return false; + } + + @Override + public int getArity() { + return 1; + } + + @Override + public int getTotalFields() { + return 1; + } + + @Override + public Class getTypeClass() { + return StatisticsOrRecord.class; + } + + @Override + public boolean isKeyType() { + return false; + } + + @Override + public TypeSerializer createSerializer(SerializerConfig config) { + TypeSerializer recordSerializer = rowTypeInformation.createSerializer(config); + return new StatisticsOrRecordSerializer(globalStatisticsSerializer, recordSerializer); + } + + @Override + public String toString() { + return "StatisticsOrRecord"; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } else if (o != null && this.getClass() == o.getClass()) { + StatisticsOrRecordTypeInformation that = (StatisticsOrRecordTypeInformation) o; + return that.sortOrder.equals(sortOrder) + && that.rowTypeInformation.equals(rowTypeInformation) + && that.globalStatisticsSerializer.equals(globalStatisticsSerializer); + } else { + return false; + } + } + + @Override + public int hashCode() { + return Objects.hash(rowTypeInformation, sortOrder, globalStatisticsSerializer); + } + + @Override + public boolean canEqual(Object obj) { + return obj instanceof StatisticsOrRecordTypeInformation; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java new file mode 100644 index 000000000000..43f72e336e06 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
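A small sketch of constructing the StatisticsOrRecordTypeInformation defined above (the table schema and sort order are assumed to be provided by the caller; not part of the patch):

import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.flink.FlinkSchemaUtil;
import org.apache.iceberg.flink.sink.shuffle.StatisticsOrRecordTypeInformation;

class ShuffleTypeInfoExample {
  // Type information for the StatisticsOrRecord stream flowing from the statistics operator
  // to the range partitioner.
  static StatisticsOrRecordTypeInformation shuffleTypeInfo(Schema schema, SortOrder sortOrder) {
    RowType rowType = FlinkSchemaUtil.convert(schema);
    return new StatisticsOrRecordTypeInformation(rowType, schema, sortOrder);
  }
}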
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +/** + * Range distribution requires gathering statistics on the sort keys to determine proper range + * boundaries to distribute/cluster rows before writer operators. + */ +public enum StatisticsType { + /** + * Tracks the data statistics as {@code Map} frequency. It works better for + * low-cardinality scenarios (like country, event_type, etc.) where the cardinalities are in + * hundreds or thousands. + * + *

      + *
    • Pro: accurate measurement on the statistics/weight of every key. + *
    • Con: memory footprint can be large if the key cardinality is high. + *
    + */ + Map, + + /** + * Sample the sort keys via reservoir sampling. Then split the range partitions via range bounds + * from sampled values. It works better for high-cardinality scenarios (like device_id, user_id, + * uuid etc.) where the cardinalities can be in millions or billions. + * + *
      + *
    • Pro: relatively low memory footprint for high-cardinality sort keys. + *
    • Con: non-precise approximation with potentially lower accuracy. + *
    + */ + Sketch, + + /** + * Initially use Map for statistics tracking. If key cardinality turns out to be high, + * automatically switch to sketch sampling. + */ + Auto +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java new file mode 100644 index 000000000000..f2efc7fa9834 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.io.UncheckedIOException; +import javax.annotation.Nullable; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; + +class StatisticsUtil { + + private StatisticsUtil() {} + + static DataStatistics createTaskStatistics( + StatisticsType type, int operatorParallelism, int numPartitions) { + if (type == StatisticsType.Map) { + return new MapDataStatistics(); + } else { + return new SketchDataStatistics( + SketchUtil.determineOperatorReservoirSize(operatorParallelism, numPartitions)); + } + } + + static byte[] serializeDataStatistics( + DataStatistics dataStatistics, TypeSerializer statisticsSerializer) { + DataOutputSerializer out = new DataOutputSerializer(64); + try { + statisticsSerializer.serialize(dataStatistics, out); + return out.getCopyOfBuffer(); + } catch (IOException e) { + throw new UncheckedIOException("Fail to serialize data statistics", e); + } + } + + static DataStatistics deserializeDataStatistics( + byte[] bytes, TypeSerializer statisticsSerializer) { + DataInputDeserializer input = new DataInputDeserializer(bytes, 0, bytes.length); + try { + return statisticsSerializer.deserialize(input); + } catch (IOException e) { + throw new UncheckedIOException("Fail to deserialize data statistics", e); + } + } + + static byte[] serializeCompletedStatistics( + CompletedStatistics completedStatistics, + TypeSerializer statisticsSerializer) { + try { + DataOutputSerializer out = new DataOutputSerializer(1024); + statisticsSerializer.serialize(completedStatistics, out); + return out.getCopyOfBuffer(); + } catch (IOException e) { + throw new UncheckedIOException("Fail to serialize aggregated statistics", e); + } + } + + static CompletedStatistics deserializeCompletedStatistics( + byte[] bytes, CompletedStatisticsSerializer statisticsSerializer) { + try { + DataInputDeserializer input = new DataInputDeserializer(bytes); + CompletedStatistics completedStatistics = statisticsSerializer.deserialize(input); + if 
(!completedStatistics.isValid()) { + throw new RuntimeException("Fail to deserialize aggregated statistics,change to v1"); + } + + return completedStatistics; + } catch (Exception e) { + try { + // If we restore from a lower version, the new version of SortKeySerializer cannot correctly + // parse the checkpointData, so we need to first switch the version to v1. Once the state + // data is successfully parsed, we need to switch the serialization version to the latest + // version to parse the subsequent data passed from the TM. + statisticsSerializer.changeSortKeySerializerVersion(1); + DataInputDeserializer input = new DataInputDeserializer(bytes); + CompletedStatistics deserialize = statisticsSerializer.deserialize(input); + statisticsSerializer.changeSortKeySerializerVersionLatest(); + return deserialize; + } catch (IOException ioException) { + throw new UncheckedIOException("Fail to deserialize aggregated statistics", ioException); + } + } + } + + static byte[] serializeGlobalStatistics( + GlobalStatistics globalStatistics, TypeSerializer statisticsSerializer) { + try { + DataOutputSerializer out = new DataOutputSerializer(1024); + statisticsSerializer.serialize(globalStatistics, out); + return out.getCopyOfBuffer(); + } catch (IOException e) { + throw new UncheckedIOException("Fail to serialize aggregated statistics", e); + } + } + + static GlobalStatistics deserializeGlobalStatistics( + byte[] bytes, TypeSerializer statisticsSerializer) { + try { + DataInputDeserializer input = new DataInputDeserializer(bytes); + return statisticsSerializer.deserialize(input); + } catch (IOException e) { + throw new UncheckedIOException("Fail to deserialize aggregated statistics", e); + } + } + + static StatisticsType collectType(StatisticsType config) { + return config == StatisticsType.Sketch ? StatisticsType.Sketch : StatisticsType.Map; + } + + static StatisticsType collectType(StatisticsType config, @Nullable GlobalStatistics statistics) { + if (statistics != null) { + return statistics.type(); + } + + return collectType(config); + } + + static StatisticsType collectType( + StatisticsType config, @Nullable CompletedStatistics statistics) { + if (statistics != null) { + return statistics.type(); + } + + return collectType(config); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java new file mode 100644 index 000000000000..796434c45136 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
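To make the StatisticsUtil helpers above concrete, a brief sketch (the parallelism and partition counts are made-up numbers; StatisticsUtil and DataStatistics are package-private, so this assumes code in the same package):

class TaskStatisticsExample {
  static DataStatistics newTaskStatistics() {
    // Auto starts out collecting Map statistics; the operator may later switch to Sketch
    // if the observed key cardinality turns out to be high. Sketch is kept as-is.
    StatisticsType collectType = StatisticsUtil.collectType(StatisticsType.Auto); // -> Map

    // Map -> MapDataStatistics; Sketch -> SketchDataStatistics with a reservoir sized by
    // SketchUtil.determineOperatorReservoirSize(operatorParallelism, numPartitions).
    return StatisticsUtil.createTaskStatistics(
        collectType, /* operatorParallelism= */ 4, /* numPartitions= */ 16);
  }
}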
+ */ +package org.apache.iceberg.flink.source; + +import org.apache.avro.generic.GenericRecord; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.io.CloseableIterator; + +public class AvroGenericRecordFileScanTaskReader implements FileScanTaskReader { + private final RowDataFileScanTaskReader rowDataReader; + private final RowDataToAvroGenericRecordConverter converter; + + public AvroGenericRecordFileScanTaskReader( + RowDataFileScanTaskReader rowDataReader, RowDataToAvroGenericRecordConverter converter) { + this.rowDataReader = rowDataReader; + this.converter = converter; + } + + @Override + public CloseableIterator open( + FileScanTask fileScanTask, InputFilesDecryptor inputFilesDecryptor) { + return CloseableIterator.transform( + rowDataReader.open(fileScanTask, inputFilesDecryptor), converter); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java new file mode 100644 index 000000000000..3beda960cec8 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Iterator; +import java.util.Locale; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Flink data iterator that reads {@link CombinedScanTask} into a {@link CloseableIterator} + * + * @param is the output data type returned by this iterator. 
+ */ +@Internal +public class DataIterator implements CloseableIterator { + + private final FileScanTaskReader fileScanTaskReader; + + private final InputFilesDecryptor inputFilesDecryptor; + private final CombinedScanTask combinedTask; + + private Iterator tasks; + private CloseableIterator currentIterator; + private int fileOffset; + private long recordOffset; + + public DataIterator( + FileScanTaskReader fileScanTaskReader, + CombinedScanTask task, + FileIO io, + EncryptionManager encryption) { + this.fileScanTaskReader = fileScanTaskReader; + + this.inputFilesDecryptor = new InputFilesDecryptor(task, io, encryption); + this.combinedTask = task; + + this.tasks = task.files().iterator(); + this.currentIterator = CloseableIterator.empty(); + + // fileOffset starts at -1 because we started + // from an empty iterator that is not from the split files. + this.fileOffset = -1; + // record offset points to the record that next() should return when called + this.recordOffset = 0L; + } + + /** + * (startingFileOffset, startingRecordOffset) points to the next row that reader should resume + * from. E.g., if the seek position is (file=0, record=1), seek moves the iterator position to the + * 2nd row in file 0. When next() is called after seek, 2nd row from file 0 should be returned. + */ + public void seek(int startingFileOffset, long startingRecordOffset) { + Preconditions.checkState( + fileOffset == -1, "Seek should be called before any other iterator actions"); + // skip files + Preconditions.checkState( + startingFileOffset < combinedTask.files().size(), + "Invalid starting file offset %s for combined scan task with %s files: %s", + startingFileOffset, + combinedTask.files().size(), + combinedTask); + for (long i = 0L; i < startingFileOffset; ++i) { + tasks.next(); + } + + updateCurrentIterator(); + // skip records within the file + for (long i = 0; i < startingRecordOffset; ++i) { + if (currentFileHasNext() && hasNext()) { + next(); + } else { + throw new IllegalStateException( + String.format( + Locale.ROOT, + "Invalid starting record offset %d for file %d from CombinedScanTask: %s", + startingRecordOffset, + startingFileOffset, + combinedTask)); + } + } + + fileOffset = startingFileOffset; + recordOffset = startingRecordOffset; + } + + @Override + public boolean hasNext() { + updateCurrentIterator(); + return currentIterator.hasNext(); + } + + @Override + public T next() { + updateCurrentIterator(); + recordOffset += 1; + return currentIterator.next(); + } + + public boolean currentFileHasNext() { + return currentIterator.hasNext(); + } + + /** Updates the current iterator field to ensure that the current Iterator is not exhausted. 
*/ + private void updateCurrentIterator() { + try { + while (!currentIterator.hasNext() && tasks.hasNext()) { + currentIterator.close(); + currentIterator = openTaskIterator(tasks.next()); + fileOffset += 1; + recordOffset = 0L; + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private CloseableIterator openTaskIterator(FileScanTask scanTask) { + return fileScanTaskReader.open(scanTask, inputFilesDecryptor); + } + + @Override + public void close() throws IOException { + // close the current iterator + currentIterator.close(); + tasks = null; + } + + public int fileOffset() { + return fileOffset; + } + + public long recordOffset() { + return recordOffset; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java new file mode 100644 index 000000000000..4394dab4d4cc --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Schema; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.flink.data.StructRowData; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; + +@Internal +public class DataTaskReader implements FileScanTaskReader { + + private final Schema readSchema; + + public DataTaskReader(Schema readSchema) { + this.readSchema = readSchema; + } + + @Override + public CloseableIterator open( + FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { + StructRowData row = new StructRowData(readSchema.asStruct()); + CloseableIterable iterable = + CloseableIterable.transform(task.asDataTask().rows(), row::setStruct); + return iterable.iterator(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java new file mode 100644 index 000000000000..927a804a4792 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
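A minimal usage sketch of the seek contract described in the DataIterator Javadoc above (the reader, scan task, table IO, and encryption manager are assumed to be supplied by the caller; not part of the patch):

import java.io.IOException;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.encryption.EncryptionManager;
import org.apache.iceberg.flink.source.DataIterator;
import org.apache.iceberg.flink.source.FileScanTaskReader;
import org.apache.iceberg.io.FileIO;

class DataIteratorExample {
  // Resumes from the 2nd record of the 1st file in the split and counts the remaining records.
  static long resumeAndCount(
      FileScanTaskReader<RowData> reader,
      CombinedScanTask task,
      FileIO io,
      EncryptionManager encryption)
      throws IOException {
    long count = 0L;
    try (DataIterator<RowData> iterator = new DataIterator<>(reader, task, io, encryption)) {
      iterator.seek(0, 1L);
      while (iterator.hasNext()) {
        iterator.next();
        count++;
      }
      // iterator.fileOffset() and iterator.recordOffset() expose the current read position.
    }
    return count;
  }
}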
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.Serializable; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.io.CloseableIterator; + +/** + * Read a {@link FileScanTask} into a {@link CloseableIterator} + * + * @param is the output data type returned by this iterator. + */ +@Internal +public interface FileScanTaskReader extends Serializable { + CloseableIterator open(FileScanTask fileScanTask, InputFilesDecryptor inputFilesDecryptor); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java new file mode 100644 index 000000000000..a68f0e50e0d0 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.concurrent.ExecutorService; +import org.apache.flink.api.common.io.DefaultInputSplitAssigner; +import org.apache.flink.api.common.io.InputFormat; +import org.apache.flink.api.common.io.LocatableInputSplitAssigner; +import org.apache.flink.api.common.io.RichInputFormat; +import org.apache.flink.api.common.io.statistics.BaseStatistics; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.io.InputSplitAssigner; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.BaseMetadataTable; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.util.ThreadPools; + +/** Flink {@link InputFormat} for Iceberg. 
*/ +public class FlinkInputFormat extends RichInputFormat { + + private static final long serialVersionUID = 1L; + + private final TableLoader tableLoader; + private final FileIO io; + private final EncryptionManager encryption; + private final ScanContext context; + private final FileScanTaskReader rowDataReader; + + private transient DataIterator iterator; + private transient long currentReadCount = 0L; + + FlinkInputFormat( + TableLoader tableLoader, + Schema tableSchema, + FileIO io, + EncryptionManager encryption, + ScanContext context) { + this.tableLoader = tableLoader; + this.io = io; + this.encryption = encryption; + this.context = context; + + tableLoader.open(); + Table table = tableLoader.loadTable(); + if (table instanceof BaseMetadataTable) { + this.rowDataReader = new DataTaskReader(context.project()); + } else { + this.rowDataReader = + new RowDataFileScanTaskReader( + tableSchema, + context.project(), + context.nameMapping(), + context.caseSensitive(), + context.filters()); + } + } + + @VisibleForTesting + Schema projectedSchema() { + return context.project(); + } + + @Override + public BaseStatistics getStatistics(BaseStatistics cachedStatistics) { + // Legacy method, not be used. + return null; + } + + @Override + public FlinkInputSplit[] createInputSplits(int minNumSplits) throws IOException { + // Called in Job manager, so it is OK to load table from catalog. + tableLoader.open(); + final ExecutorService workerPool = + ThreadPools.newFixedThreadPool("iceberg-plan-worker-pool", context.planParallelism()); + try (TableLoader loader = tableLoader) { + Table table = loader.loadTable(); + return FlinkSplitPlanner.planInputSplits(table, context, workerPool); + } finally { + workerPool.shutdown(); + } + } + + @Override + public InputSplitAssigner getInputSplitAssigner(FlinkInputSplit[] inputSplits) { + return context.exposeLocality() + ? new LocatableInputSplitAssigner(inputSplits) + : new DefaultInputSplitAssigner(inputSplits); + } + + @Override + public void configure(Configuration parameters) {} + + @Override + public void open(FlinkInputSplit split) { + this.iterator = new DataIterator<>(rowDataReader, split.getTask(), io, encryption); + } + + @Override + public boolean reachedEnd() { + if (context.limit() > 0 && currentReadCount >= context.limit()) { + return true; + } else { + return !iterator.hasNext(); + } + } + + @Override + public RowData nextRecord(RowData reuse) { + currentReadCount++; + return iterator.next(); + } + + @Override + public void close() throws IOException { + if (iterator != null) { + iterator.close(); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java new file mode 100644 index 000000000000..16fd4f39596c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.util.Arrays; +import javax.annotation.Nullable; +import org.apache.flink.core.io.LocatableInputSplit; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +public class FlinkInputSplit extends LocatableInputSplit { + + private final CombinedScanTask task; + + FlinkInputSplit(int splitNumber, CombinedScanTask task, @Nullable String[] hostnames) { + super(splitNumber, hostnames); + this.task = task; + } + + CombinedScanTask getTask() { + return task; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("splitNumber", getSplitNumber()) + .add("task", task) + .add("hosts", Arrays.toString(getHostnames())) + .toString(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java new file mode 100644 index 000000000000..e0c99107d549 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.legacy.SourceFunction; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TableScan; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkReadOptions; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.PropertyUtil; + +/** + * Flink source builder for old {@link SourceFunction} implementation. + * + * @deprecated since 1.7.0, will be removed in 2.0.0. Use {@link IcebergSource} instead, which + * implement the newer FLIP-27 source interface. This class implements the old {@link + * SourceFunction} that has been marked as deprecated in Flink since Aug 2023. + */ +@Deprecated +public class FlinkSource { + private FlinkSource() {} + + /** + * Initialize a {@link Builder} to read the data from iceberg table. Equivalent to {@link + * TableScan}. See more options in {@link ScanContext}. + * + *

The Source can read static data in bounded mode. It can also continuously check the + * arrival of new data and read records incrementally. + * + *

      + *
    • Without startSnapshotId: Bounded + *
    • With startSnapshotId and with endSnapshotId: Bounded + *
• With startSnapshotId (-1 means unbounded preceding) and without endSnapshotId: Unbounded + *
    + * + *

    + * + * @return {@link Builder} to connect the iceberg table. + */ + public static Builder forRowData() { + return new Builder(); + } + + /** Source builder to build {@link DataStream}. */ + public static class Builder { + private StreamExecutionEnvironment env; + private Table table; + private TableLoader tableLoader; + private TableSchema projectedSchema; + private ReadableConfig readableConfig = new Configuration(); + private final ScanContext.Builder contextBuilder = ScanContext.builder(); + private Boolean exposeLocality; + + private final Map readOptions = Maps.newHashMap(); + + public Builder tableLoader(TableLoader newLoader) { + this.tableLoader = newLoader; + return this; + } + + public Builder table(Table newTable) { + this.table = newTable; + return this; + } + + public Builder env(StreamExecutionEnvironment newEnv) { + this.env = newEnv; + return this; + } + + public Builder filters(List filters) { + contextBuilder.filters(filters); + return this; + } + + public Builder project(TableSchema schema) { + this.projectedSchema = schema; + return this; + } + + public Builder limit(Long newLimit) { + if (newLimit != null) { + readOptions.put(FlinkReadOptions.LIMIT, Long.toString(newLimit)); + } + return this; + } + + public Builder set(String property, String value) { + readOptions.put(property, value); + return this; + } + + public Builder setAll(Map properties) { + readOptions.putAll(properties); + return this; + } + + /** + * @deprecated Use {@link #setAll} instead. + */ + @Deprecated + public Builder properties(Map properties) { + readOptions.putAll(properties); + return this; + } + + public Builder caseSensitive(boolean caseSensitive) { + readOptions.put(FlinkReadOptions.CASE_SENSITIVE, Boolean.toString(caseSensitive)); + return this; + } + + public Builder snapshotId(Long snapshotId) { + readOptions.put(FlinkReadOptions.SNAPSHOT_ID.key(), Long.toString(snapshotId)); + return this; + } + + public Builder branch(String branch) { + readOptions.put(FlinkReadOptions.BRANCH.key(), branch); + return this; + } + + public Builder tag(String tag) { + readOptions.put(FlinkReadOptions.TAG.key(), tag); + return this; + } + + public Builder startSnapshotId(Long startSnapshotId) { + readOptions.put(FlinkReadOptions.START_SNAPSHOT_ID.key(), Long.toString(startSnapshotId)); + return this; + } + + public Builder endSnapshotId(Long endSnapshotId) { + readOptions.put(FlinkReadOptions.END_SNAPSHOT_ID.key(), Long.toString(endSnapshotId)); + return this; + } + + public Builder startTag(String startTag) { + readOptions.put(FlinkReadOptions.START_TAG.key(), startTag); + return this; + } + + public Builder endTag(String endTag) { + readOptions.put(FlinkReadOptions.END_TAG.key(), endTag); + return this; + } + + public Builder asOfTimestamp(Long asOfTimestamp) { + readOptions.put(FlinkReadOptions.AS_OF_TIMESTAMP.key(), Long.toString(asOfTimestamp)); + return this; + } + + public Builder splitSize(Long splitSize) { + readOptions.put(FlinkReadOptions.SPLIT_SIZE, Long.toString(splitSize)); + return this; + } + + public Builder splitLookback(Integer splitLookback) { + readOptions.put(FlinkReadOptions.SPLIT_LOOKBACK, Integer.toString(splitLookback)); + return this; + } + + public Builder splitOpenFileCost(Long splitOpenFileCost) { + readOptions.put(FlinkReadOptions.SPLIT_FILE_OPEN_COST, Long.toString(splitOpenFileCost)); + return this; + } + + public Builder streaming(boolean streaming) { + readOptions.put(FlinkReadOptions.STREAMING, Boolean.toString(streaming)); + return this; + } + + public Builder 
exposeLocality(boolean newExposeLocality) { + this.exposeLocality = newExposeLocality; + return this; + } + + public Builder nameMapping(String nameMapping) { + readOptions.put(TableProperties.DEFAULT_NAME_MAPPING, nameMapping); + return this; + } + + public Builder monitorInterval(Duration interval) { + readOptions.put(FlinkReadOptions.MONITOR_INTERVAL, interval.toNanos() + " ns"); + return this; + } + + public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { + readOptions.put( + FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT, + Integer.toString(newMaxPlanningSnapshotCount)); + return this; + } + + public Builder flinkConf(ReadableConfig config) { + this.readableConfig = config; + return this; + } + + public FlinkInputFormat buildFormat() { + Preconditions.checkNotNull(tableLoader, "TableLoader should not be null"); + + Schema icebergSchema; + FileIO io; + EncryptionManager encryption; + if (table == null) { + // load required fields by table loader. + tableLoader.open(); + try (TableLoader loader = tableLoader) { + table = loader.loadTable(); + icebergSchema = table.schema(); + io = table.io(); + encryption = table.encryption(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } else { + icebergSchema = table.schema(); + io = table.io(); + encryption = table.encryption(); + } + + if (projectedSchema == null) { + contextBuilder.project(icebergSchema); + } else { + contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedSchema)); + } + + contextBuilder.exposeLocality( + SourceUtil.isLocalityEnabled(table, readableConfig, exposeLocality)); + contextBuilder.planParallelism( + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE)); + + contextBuilder.resolveConfig(table, readOptions, readableConfig); + + ScanContext context = contextBuilder.build(); + context.validate(); + return new FlinkInputFormat(tableLoader, icebergSchema, io, encryption, context); + } + + public DataStream build() { + Preconditions.checkNotNull(env, "StreamExecutionEnvironment should not be null"); + FlinkInputFormat format = buildFormat(); + + ScanContext context = contextBuilder.build(); + TypeInformation typeInfo = + FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(context.project())); + + if (!context.isStreaming()) { + int parallelism = + SourceUtil.inferParallelism( + readableConfig, + context.limit(), + () -> { + try { + return format.createInputSplits(0).length; + } catch (IOException e) { + throw new UncheckedIOException( + "Failed to create iceberg input splits for table: " + table, e); + } + }); + if (env.getMaxParallelism() > 0) { + parallelism = Math.min(parallelism, env.getMaxParallelism()); + } + return env.createInput(format, typeInfo).setParallelism(parallelism); + } else { + StreamingMonitorFunction function = new StreamingMonitorFunction(tableLoader, context); + + String monitorFunctionName = String.format("Iceberg table (%s) monitor", table); + String readerOperatorName = String.format("Iceberg table (%s) reader", table); + + return env.addSource(function, monitorFunctionName) + .transform(readerOperatorName, typeInfo, StreamingReaderOperator.factory(format)); + } + } + } + + public static boolean isBounded(Map properties) { + return !PropertyUtil.propertyAsBoolean(properties, FlinkReadOptions.STREAMING, false); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java new file mode 
100644 index 000000000000..15078809714f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; +import java.util.concurrent.ExecutorService; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.IncrementalAppendScan; +import org.apache.iceberg.Scan; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TableScan; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.hadoop.Util; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.Tasks; + +@Internal +public class FlinkSplitPlanner { + private FlinkSplitPlanner() {} + + static FlinkInputSplit[] planInputSplits( + Table table, ScanContext context, ExecutorService workerPool) { + try (CloseableIterable tasksIterable = + planTasks(table, context, workerPool)) { + List tasks = Lists.newArrayList(tasksIterable); + FlinkInputSplit[] splits = new FlinkInputSplit[tasks.size()]; + boolean exposeLocality = context.exposeLocality(); + + Tasks.range(tasks.size()) + .stopOnFailure() + .executeWith(exposeLocality ? 
workerPool : null) + .run( + index -> { + CombinedScanTask task = tasks.get(index); + String[] hostnames = null; + if (exposeLocality) { + hostnames = Util.blockLocations(table.io(), task); + } + splits[index] = new FlinkInputSplit(index, task, hostnames); + }); + return splits; + } catch (IOException e) { + throw new UncheckedIOException("Failed to process tasks iterable", e); + } + } + + /** This returns splits for the FLIP-27 source */ + public static List planIcebergSourceSplits( + Table table, ScanContext context, ExecutorService workerPool) { + try (CloseableIterable tasksIterable = + planTasks(table, context, workerPool)) { + return Lists.newArrayList( + CloseableIterable.transform(tasksIterable, IcebergSourceSplit::fromCombinedScanTask)); + } catch (IOException e) { + throw new UncheckedIOException("Failed to process task iterable: ", e); + } + } + + static CloseableIterable planTasks( + Table table, ScanContext context, ExecutorService workerPool) { + ScanMode scanMode = checkScanMode(context); + if (scanMode == ScanMode.INCREMENTAL_APPEND_SCAN) { + IncrementalAppendScan scan = table.newIncrementalAppendScan(); + scan = refineScanWithBaseConfigs(scan, context, workerPool); + + if (context.startTag() != null) { + Preconditions.checkArgument( + table.snapshot(context.startTag()) != null, + "Cannot find snapshot with tag %s", + context.startTag()); + scan = scan.fromSnapshotExclusive(table.snapshot(context.startTag()).snapshotId()); + } + + if (context.startSnapshotId() != null) { + Preconditions.checkArgument( + context.startTag() == null, "START_SNAPSHOT_ID and START_TAG cannot both be set"); + scan = scan.fromSnapshotExclusive(context.startSnapshotId()); + } + + if (context.endTag() != null) { + Preconditions.checkArgument( + table.snapshot(context.endTag()) != null, + "Cannot find snapshot with tag %s", + context.endTag()); + scan = scan.toSnapshot(table.snapshot(context.endTag()).snapshotId()); + } + + if (context.endSnapshotId() != null) { + Preconditions.checkArgument( + context.endTag() == null, "END_SNAPSHOT_ID and END_TAG cannot both be set"); + scan = scan.toSnapshot(context.endSnapshotId()); + } + + return scan.planTasks(); + } else { + TableScan scan = table.newScan(); + scan = refineScanWithBaseConfigs(scan, context, workerPool); + + if (context.snapshotId() != null) { + scan = scan.useSnapshot(context.snapshotId()); + } else if (context.tag() != null) { + scan = scan.useRef(context.tag()); + } else if (context.branch() != null) { + scan = scan.useRef(context.branch()); + } + + if (context.asOfTimestamp() != null) { + scan = scan.asOfTime(context.asOfTimestamp()); + } + + return scan.planTasks(); + } + } + + @VisibleForTesting + enum ScanMode { + BATCH, + INCREMENTAL_APPEND_SCAN + } + + @VisibleForTesting + static ScanMode checkScanMode(ScanContext context) { + if (context.startSnapshotId() != null + || context.endSnapshotId() != null + || context.startTag() != null + || context.endTag() != null) { + return ScanMode.INCREMENTAL_APPEND_SCAN; + } else { + return ScanMode.BATCH; + } + } + + /** refine scan with common configs */ + private static > T refineScanWithBaseConfigs( + T scan, ScanContext context, ExecutorService workerPool) { + T refinedScan = + scan.caseSensitive(context.caseSensitive()).project(context.project()).planWith(workerPool); + + if (context.includeColumnStats()) { + refinedScan = refinedScan.includeColumnStats(); + } + + if (context.includeStatsForColumns() != null) { + refinedScan = refinedScan.includeColumnStats(context.includeStatsForColumns()); + 
} + + refinedScan = refinedScan.option(TableProperties.SPLIT_SIZE, context.splitSize().toString()); + + refinedScan = + refinedScan.option(TableProperties.SPLIT_LOOKBACK, context.splitLookback().toString()); + + refinedScan = + refinedScan.option( + TableProperties.SPLIT_OPEN_FILE_COST, context.splitOpenFileCost().toString()); + + if (context.filters() != null) { + for (Expression filter : context.filters()) { + refinedScan = refinedScan.filter(filter); + } + } + + return refinedScan; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java new file mode 100644 index 000000000000..ec7cb010b6be --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java @@ -0,0 +1,702 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import javax.annotation.Nullable; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSource; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.iceberg.BaseMetadataTable; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkReadConf; +import org.apache.iceberg.flink.FlinkReadOptions; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TableLoader; +import 
org.apache.iceberg.flink.source.assigner.OrderedSplitAssignerFactory; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.flink.source.assigner.SplitAssigner; +import org.apache.iceberg.flink.source.assigner.SplitAssignerFactory; +import org.apache.iceberg.flink.source.enumerator.ContinuousIcebergEnumerator; +import org.apache.iceberg.flink.source.enumerator.ContinuousSplitPlanner; +import org.apache.iceberg.flink.source.enumerator.ContinuousSplitPlannerImpl; +import org.apache.iceberg.flink.source.enumerator.IcebergEnumeratorState; +import org.apache.iceberg.flink.source.enumerator.IcebergEnumeratorStateSerializer; +import org.apache.iceberg.flink.source.enumerator.StaticIcebergEnumerator; +import org.apache.iceberg.flink.source.reader.ColumnStatsWatermarkExtractor; +import org.apache.iceberg.flink.source.reader.ConverterReaderFunction; +import org.apache.iceberg.flink.source.reader.IcebergSourceReader; +import org.apache.iceberg.flink.source.reader.IcebergSourceReaderMetrics; +import org.apache.iceberg.flink.source.reader.MetaDataReaderFunction; +import org.apache.iceberg.flink.source.reader.ReaderFunction; +import org.apache.iceberg.flink.source.reader.RowDataConverter; +import org.apache.iceberg.flink.source.reader.RowDataReaderFunction; +import org.apache.iceberg.flink.source.reader.SerializableRecordEmitter; +import org.apache.iceberg.flink.source.reader.SplitWatermarkExtractor; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; +import org.apache.iceberg.flink.source.split.SerializableComparator; +import org.apache.iceberg.flink.source.split.SplitComparators; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class IcebergSource implements Source { + private static final Logger LOG = LoggerFactory.getLogger(IcebergSource.class); + + // This table loader can be closed, and it is only safe to use this instance for resource + // independent information (e.g. a table name). Copies of this are required to avoid lifecycle + // management conflicts with the user provided table loader. e.g. a copy of this is required for + // split planning, which uses the underlying io, and should be closed after split planning is + // complete. + private final TableLoader tableLoader; + private final ScanContext scanContext; + private final ReaderFunction readerFunction; + private final SplitAssignerFactory assignerFactory; + private final SerializableComparator splitComparator; + private final SerializableRecordEmitter emitter; + private final String tableName; + + // cache the discovered splits by planSplitsForBatch, which can be called twice. And they come + // from two different threads: (1) source/stream construction by main thread (2) enumerator + // creation. Hence need volatile here. 
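+
+  // Illustrative usage sketch (assumes a TableLoader such as TableLoader.fromHadoopTable(location)
+  // and an existing StreamExecutionEnvironment named env; adjust to the actual deployment):
+  //
+  //   DataStream<RowData> stream =
+  //       IcebergSource.forRowData()
+  //           .tableLoader(tableLoader)
+  //           .assignerFactory(new SimpleSplitAssignerFactory())
+  //           .streaming(true)
+  //           .monitorInterval(Duration.ofSeconds(30))
+  //           .buildStream(env);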
+ private volatile List batchSplits; + + IcebergSource( + TableLoader tableLoader, + ScanContext scanContext, + ReaderFunction readerFunction, + SplitAssignerFactory assignerFactory, + SerializableComparator splitComparator, + Table table, + SerializableRecordEmitter emitter) { + Preconditions.checkNotNull(tableLoader, "tableLoader is required."); + Preconditions.checkNotNull(readerFunction, "readerFunction is required."); + Preconditions.checkNotNull(assignerFactory, "assignerFactory is required."); + Preconditions.checkNotNull(table, "table is required."); + this.tableLoader = tableLoader; + this.scanContext = scanContext; + this.readerFunction = readerFunction; + this.assignerFactory = assignerFactory; + this.splitComparator = splitComparator; + this.emitter = emitter; + this.tableName = table.name(); + } + + String name() { + return "IcebergSource-" + tableName; + } + + private String planningThreadName() { + // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness + // within a job. SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which + // would contain the OperatorID. Need to discuss with Flink community whether it is ok to expose + // a public API like the protected method "OperatorCoordinator.Context getCoordinatorContext()" + // from SourceCoordinatorContext implementation. For now,
  • - is used as + // the unique thread pool name. + return tableName + "-" + UUID.randomUUID(); + } + + /** + * Cache the enumerated splits for batch execution to avoid double planning as there are two code + * paths obtaining splits: (1) infer parallelism (2) enumerator creation. + */ + private List planSplitsForBatch(String threadName) { + if (batchSplits != null) { + return batchSplits; + } + + ExecutorService workerPool = + ThreadPools.newFixedThreadPool(threadName, scanContext.planParallelism()); + try (TableLoader loader = tableLoader.clone()) { + loader.open(); + this.batchSplits = + FlinkSplitPlanner.planIcebergSourceSplits(loader.loadTable(), scanContext, workerPool); + LOG.info( + "Discovered {} splits from table {} during job initialization", + batchSplits.size(), + tableName); + return batchSplits; + } catch (IOException e) { + throw new UncheckedIOException("Failed to close table loader", e); + } finally { + workerPool.shutdown(); + } + } + + @Override + public Boundedness getBoundedness() { + return scanContext.isStreaming() ? Boundedness.CONTINUOUS_UNBOUNDED : Boundedness.BOUNDED; + } + + @Override + public SourceReader createReader(SourceReaderContext readerContext) { + IcebergSourceReaderMetrics metrics = + new IcebergSourceReaderMetrics(readerContext.metricGroup(), tableName); + return new IcebergSourceReader<>( + emitter, metrics, readerFunction, splitComparator, readerContext); + } + + @Override + public SplitEnumerator createEnumerator( + SplitEnumeratorContext enumContext) { + return createEnumerator(enumContext, null); + } + + @Override + public SplitEnumerator restoreEnumerator( + SplitEnumeratorContext enumContext, IcebergEnumeratorState enumState) { + return createEnumerator(enumContext, enumState); + } + + @Override + public SimpleVersionedSerializer getSplitSerializer() { + return new IcebergSourceSplitSerializer(scanContext.caseSensitive()); + } + + @Override + public SimpleVersionedSerializer getEnumeratorCheckpointSerializer() { + return new IcebergEnumeratorStateSerializer(scanContext.caseSensitive()); + } + + private SplitEnumerator createEnumerator( + SplitEnumeratorContext enumContext, + @Nullable IcebergEnumeratorState enumState) { + SplitAssigner assigner; + if (enumState == null) { + assigner = assignerFactory.createAssigner(); + } else { + LOG.info( + "Iceberg source restored {} splits from state for table {}", + enumState.pendingSplits().size(), + tableName); + assigner = assignerFactory.createAssigner(enumState.pendingSplits()); + } + if (scanContext.isStreaming()) { + ContinuousSplitPlanner splitPlanner = + new ContinuousSplitPlannerImpl(tableLoader, scanContext, planningThreadName()); + return new ContinuousIcebergEnumerator( + enumContext, assigner, scanContext, splitPlanner, enumState); + } else { + if (enumState == null) { + // Only do scan planning if nothing is restored from checkpoint state + List splits = planSplitsForBatch(planningThreadName()); + assigner.onDiscoveredSplits(splits); + // clear the cached splits after enumerator creation as they won't be needed anymore + this.batchSplits = null; + } + + return new StaticIcebergEnumerator(enumContext, assigner); + } + } + + private boolean shouldInferParallelism() { + return !scanContext.isStreaming(); + } + + private int inferParallelism(ReadableConfig flinkConf, StreamExecutionEnvironment env) { + int parallelism = + SourceUtil.inferParallelism( + flinkConf, + scanContext.limit(), + () -> { + List splits = planSplitsForBatch(planningThreadName()); + return splits.size(); + }); + + if 
(env.getMaxParallelism() > 0) { + parallelism = Math.min(parallelism, env.getMaxParallelism()); + } + + return parallelism; + } + + /** + * Create a source builder. + * + * @deprecated since 1.7.0. Will be removed in 2.0.0; use{@link IcebergSource#forRowData()} or + * {@link IcebergSource#forOutputType(RowDataConverter)} instead + */ + @Deprecated + public static Builder builder() { + return new Builder<>(); + } + + /** Create a source builder for RowData output type. */ + public static Builder forRowData() { + return new Builder<>(); + } + + /** + * Create a source builder that would convert {@link RowData} to the output type {@code T}. + * + * @param converter convert {@link RowData} to output type {@code T} + * @param output type + * @return an IcebergSource builder + */ + public static Builder forOutputType(RowDataConverter converter) { + return new Builder().converter(converter); + } + + public static class Builder { + private TableLoader tableLoader; + private Table table; + private SplitAssignerFactory splitAssignerFactory; + private SerializableComparator splitComparator; + private ReaderFunction readerFunction; + private RowDataConverter converter; + private ReadableConfig flinkConfig = new Configuration(); + private final ScanContext.Builder contextBuilder = ScanContext.builder(); + private TableSchema projectedTableSchema; + private ResolvedSchema projectedFlinkSchema; + private Boolean exposeLocality; + + private final Map readOptions = Maps.newHashMap(); + + Builder() {} + + public Builder tableLoader(TableLoader loader) { + this.tableLoader = loader; + return this; + } + + public Builder table(Table newTable) { + this.table = newTable; + return this; + } + + public Builder assignerFactory(SplitAssignerFactory assignerFactory) { + this.splitAssignerFactory = assignerFactory; + return this; + } + + public Builder splitComparator( + SerializableComparator newSplitComparator) { + this.splitComparator = newSplitComparator; + return this; + } + + /** + * @deprecated since 1.7.0. Will be removed in 2.0.0; use{@link + * IcebergSource#forOutputType(RowDataConverter)} instead to produce output type other than + * {@link RowData}. + */ + @Deprecated + public Builder readerFunction(ReaderFunction newReaderFunction) { + Preconditions.checkState( + converter == null, + "Cannot set reader function when builder was created via IcebergSource.forOutputType(Converter)"); + this.readerFunction = newReaderFunction; + return this; + } + + /** + * Don't need to be public. It is set by {@link IcebergSource#forOutputType(RowDataConverter)}. 
+ */ + private Builder converter(RowDataConverter newConverter) { + this.converter = newConverter; + return this; + } + + public Builder flinkConfig(ReadableConfig config) { + this.flinkConfig = config; + return this; + } + + public Builder caseSensitive(boolean newCaseSensitive) { + readOptions.put(FlinkReadOptions.CASE_SENSITIVE, Boolean.toString(newCaseSensitive)); + return this; + } + + public Builder useSnapshotId(Long newSnapshotId) { + if (newSnapshotId != null) { + readOptions.put(FlinkReadOptions.SNAPSHOT_ID.key(), Long.toString(newSnapshotId)); + } + return this; + } + + public Builder streamingStartingStrategy(StreamingStartingStrategy newStartingStrategy) { + readOptions.put(FlinkReadOptions.STARTING_STRATEGY, newStartingStrategy.name()); + return this; + } + + public Builder startSnapshotTimestamp(Long newStartSnapshotTimestamp) { + if (newStartSnapshotTimestamp != null) { + readOptions.put( + FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.key(), + Long.toString(newStartSnapshotTimestamp)); + } + return this; + } + + public Builder startSnapshotId(Long newStartSnapshotId) { + if (newStartSnapshotId != null) { + readOptions.put( + FlinkReadOptions.START_SNAPSHOT_ID.key(), Long.toString(newStartSnapshotId)); + } + return this; + } + + public Builder tag(String tag) { + readOptions.put(FlinkReadOptions.TAG.key(), tag); + return this; + } + + public Builder branch(String branch) { + readOptions.put(FlinkReadOptions.BRANCH.key(), branch); + return this; + } + + public Builder startTag(String startTag) { + readOptions.put(FlinkReadOptions.START_TAG.key(), startTag); + return this; + } + + public Builder endTag(String endTag) { + readOptions.put(FlinkReadOptions.END_TAG.key(), endTag); + return this; + } + + public Builder endSnapshotId(Long newEndSnapshotId) { + if (newEndSnapshotId != null) { + readOptions.put(FlinkReadOptions.END_SNAPSHOT_ID.key(), Long.toString(newEndSnapshotId)); + } + return this; + } + + public Builder asOfTimestamp(Long newAsOfTimestamp) { + if (newAsOfTimestamp != null) { + readOptions.put(FlinkReadOptions.AS_OF_TIMESTAMP.key(), Long.toString(newAsOfTimestamp)); + } + return this; + } + + public Builder splitSize(Long newSplitSize) { + if (newSplitSize != null) { + readOptions.put(FlinkReadOptions.SPLIT_SIZE, Long.toString(newSplitSize)); + } + return this; + } + + public Builder splitLookback(Integer newSplitLookback) { + if (newSplitLookback != null) { + readOptions.put(FlinkReadOptions.SPLIT_LOOKBACK, Integer.toString(newSplitLookback)); + } + return this; + } + + public Builder splitOpenFileCost(Long newSplitOpenFileCost) { + if (newSplitOpenFileCost != null) { + readOptions.put(FlinkReadOptions.SPLIT_FILE_OPEN_COST, Long.toString(newSplitOpenFileCost)); + } + + return this; + } + + public Builder streaming(boolean streaming) { + readOptions.put(FlinkReadOptions.STREAMING, Boolean.toString(streaming)); + return this; + } + + public Builder monitorInterval(Duration newMonitorInterval) { + if (newMonitorInterval != null) { + readOptions.put(FlinkReadOptions.MONITOR_INTERVAL, newMonitorInterval.toNanos() + " ns"); + } + return this; + } + + public Builder nameMapping(String newNameMapping) { + readOptions.put(TableProperties.DEFAULT_NAME_MAPPING, newNameMapping); + return this; + } + + public Builder project(Schema newProjectedSchema) { + this.contextBuilder.project(newProjectedSchema); + return this; + } + + /** + * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #project(ResolvedSchema)} + * instead. 
+ */ + @Deprecated + public Builder project(TableSchema newProjectedFlinkSchema) { + this.projectedTableSchema = newProjectedFlinkSchema; + return this; + } + + public Builder project(ResolvedSchema newProjectedFlinkSchema) { + this.projectedFlinkSchema = newProjectedFlinkSchema; + return this; + } + + public Builder filters(List newFilters) { + this.contextBuilder.filters(newFilters); + return this; + } + + public Builder limit(Long newLimit) { + if (newLimit != null) { + readOptions.put(FlinkReadOptions.LIMIT, Long.toString(newLimit)); + } + return this; + } + + public Builder includeColumnStats(boolean newIncludeColumnStats) { + readOptions.put( + FlinkReadOptions.INCLUDE_COLUMN_STATS, Boolean.toString(newIncludeColumnStats)); + return this; + } + + public Builder planParallelism(int planParallelism) { + readOptions.put( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.key(), + Integer.toString(planParallelism)); + return this; + } + + public Builder exposeLocality(boolean newExposeLocality) { + this.exposeLocality = newExposeLocality; + return this; + } + + public Builder maxAllowedPlanningFailures(int maxAllowedPlanningFailures) { + readOptions.put( + FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.key(), + Integer.toString(maxAllowedPlanningFailures)); + return this; + } + + /** + * Set the read properties for Flink source. View the supported properties in {@link + * FlinkReadOptions} + */ + public Builder set(String property, String value) { + readOptions.put(property, value); + return this; + } + + /** + * Set the read properties for Flink source. View the supported properties in {@link + * FlinkReadOptions} + */ + public Builder setAll(Map properties) { + readOptions.putAll(properties); + return this; + } + + /** + * Emits watermarks once per split based on the min value of column statistics from files + * metadata in the given split. The generated watermarks are also used for ordering the splits + * for read. Accepted column types are timestamp/timestamptz/long. For long columns consider + * setting {@link #watermarkColumnTimeUnit(TimeUnit)}. + * + *
<p>
    Consider setting `read.split.open-file-cost` to prevent combining small files to a single + * split when the watermark is used for watermark alignment. + */ + public Builder watermarkColumn(String columnName) { + Preconditions.checkArgument( + splitAssignerFactory == null, + "Watermark column and SplitAssigner should not be set in the same source"); + readOptions.put(FlinkReadOptions.WATERMARK_COLUMN, columnName); + return this; + } + + /** + * When the type of the {@link #watermarkColumn} is {@link + * org.apache.iceberg.types.Types.LongType}, then sets the {@link TimeUnit} to convert the + * value. The default value is {@link TimeUnit#MICROSECONDS}. + */ + public Builder watermarkColumnTimeUnit(TimeUnit timeUnit) { + readOptions.put(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT, timeUnit.name()); + return this; + } + + /** + * @deprecated will be removed in 2.0.0; use {@link #setAll} instead. + */ + @Deprecated + public Builder properties(Map properties) { + readOptions.putAll(properties); + return this; + } + + public IcebergSource build() { + if (table == null) { + try (TableLoader loader = tableLoader) { + loader.open(); + this.table = tableLoader.loadTable(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + contextBuilder.resolveConfig(table, readOptions, flinkConfig); + contextBuilder.exposeLocality( + SourceUtil.isLocalityEnabled(table, flinkConfig, exposeLocality)); + contextBuilder.planParallelism( + flinkConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE)); + Schema icebergSchema = table.schema(); + if (projectedFlinkSchema != null) { + contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedFlinkSchema)); + } else if (projectedTableSchema != null) { + contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedTableSchema)); + } + + SerializableRecordEmitter emitter = SerializableRecordEmitter.defaultEmitter(); + FlinkReadConf flinkReadConf = new FlinkReadConf(table, readOptions, flinkConfig); + String watermarkColumn = flinkReadConf.watermarkColumn(); + TimeUnit watermarkTimeUnit = flinkReadConf.watermarkColumnTimeUnit(); + + if (watermarkColumn != null) { + // Column statistics is needed for watermark generation + contextBuilder.includeColumnStats(Sets.newHashSet(watermarkColumn)); + + SplitWatermarkExtractor watermarkExtractor = + new ColumnStatsWatermarkExtractor(icebergSchema, watermarkColumn, watermarkTimeUnit); + emitter = SerializableRecordEmitter.emitterWithWatermark(watermarkExtractor); + splitAssignerFactory = + new OrderedSplitAssignerFactory(SplitComparators.watermark(watermarkExtractor)); + } + + ScanContext context = contextBuilder.build(); + context.validate(); + if (readerFunction == null) { + this.readerFunction = readerFunction(context); + } + + if (splitAssignerFactory == null) { + if (splitComparator == null) { + splitAssignerFactory = new SimpleSplitAssignerFactory(); + } else { + splitAssignerFactory = new OrderedSplitAssignerFactory(splitComparator); + } + } + + // Since builder already load the table, pass it to the source to avoid double loading + return new IcebergSource<>( + tableLoader, + context, + readerFunction, + splitAssignerFactory, + splitComparator, + table, + emitter); + } + + /** + * Build the {@link IcebergSource} and create a {@link DataStream} from the source. Watermark + * strategy is set to {@link WatermarkStrategy#noWatermarks()}. 
+ * + * @return data stream from the Iceberg source + */ + public DataStream buildStream(StreamExecutionEnvironment env) { + // buildStream should only be called with RowData or Converter paths. + Preconditions.checkState( + readerFunction == null, + "Cannot set reader function when building a data stream from the source"); + IcebergSource source = build(); + TypeInformation outputTypeInfo = + outputTypeInfo(converter, table.schema(), source.scanContext.project()); + DataStreamSource stream = + env.fromSource(source, WatermarkStrategy.noWatermarks(), source.name(), outputTypeInfo); + if (source.shouldInferParallelism()) { + stream = stream.setParallelism(source.inferParallelism(flinkConfig, env)); + } + + return stream; + } + + private static TypeInformation outputTypeInfo( + RowDataConverter converter, Schema tableSchema, Schema projected) { + if (converter != null) { + return converter.getProducedType(); + } else { + // output type is RowData + Schema readSchema = projected != null ? projected : tableSchema; + return (TypeInformation) + FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(readSchema)); + } + } + + private ReaderFunction readerFunction(ScanContext context) { + if (table instanceof BaseMetadataTable) { + MetaDataReaderFunction rowDataReaderFunction = + new MetaDataReaderFunction( + flinkConfig, table.schema(), context.project(), table.io(), table.encryption()); + return (ReaderFunction) rowDataReaderFunction; + } else { + if (converter == null) { + return (ReaderFunction) + new RowDataReaderFunction( + flinkConfig, + table.schema(), + context.project(), + context.nameMapping(), + context.caseSensitive(), + table.io(), + table.encryption(), + context.filters(), + context.limit()); + } else { + return new ConverterReaderFunction<>( + converter, + flinkConfig, + table.schema(), + context.project(), + context.nameMapping(), + context.caseSensitive(), + table.io(), + table.encryption(), + context.filters(), + context.limit()); + } + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java new file mode 100644 index 000000000000..aeecd43e7f14 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import org.apache.flink.annotation.Internal; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.ProviderContext; +import org.apache.flink.table.connector.source.DataStreamScanProvider; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsSourceWatermark; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.table.types.DataType; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkFilters; +import org.apache.iceberg.flink.FlinkReadOptions; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.source.assigner.SplitAssignerType; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.PropertyUtil; + +/** Flink Iceberg table source. 
*/ +@Internal +public class IcebergTableSource + implements ScanTableSource, + SupportsProjectionPushDown, + SupportsFilterPushDown, + SupportsLimitPushDown, + SupportsSourceWatermark { + + private int[] projectedFields; + private Long limit; + private List filters; + + private final TableLoader loader; + private final ResolvedSchema schema; + private final Map properties; + private final boolean isLimitPushDown; + private final ReadableConfig readableConfig; + + private IcebergTableSource(IcebergTableSource toCopy) { + this.loader = toCopy.loader; + this.schema = toCopy.schema; + this.properties = toCopy.properties; + this.projectedFields = toCopy.projectedFields; + this.isLimitPushDown = toCopy.isLimitPushDown; + this.limit = toCopy.limit; + this.filters = toCopy.filters; + this.readableConfig = toCopy.readableConfig; + } + + public IcebergTableSource( + TableLoader loader, + ResolvedSchema schema, + Map properties, + ReadableConfig readableConfig) { + this(loader, schema, properties, null, false, null, ImmutableList.of(), readableConfig); + } + + private IcebergTableSource( + TableLoader loader, + ResolvedSchema schema, + Map properties, + int[] projectedFields, + boolean isLimitPushDown, + Long limit, + List filters, + ReadableConfig readableConfig) { + this.loader = loader; + this.schema = schema; + this.properties = properties; + this.projectedFields = projectedFields; + this.isLimitPushDown = isLimitPushDown; + this.limit = limit; + this.filters = filters; + this.readableConfig = readableConfig; + } + + @Override + public void applyProjection(int[][] projectFields, DataType producedDataType) { + this.projectedFields = new int[projectFields.length]; + for (int i = 0; i < projectFields.length; i++) { + Preconditions.checkArgument( + projectFields[i].length == 1, "Don't support nested projection in iceberg source now."); + this.projectedFields[i] = projectFields[i][0]; + } + } + + private DataStream createDataStream(StreamExecutionEnvironment execEnv) { + return FlinkSource.forRowData() + .env(execEnv) + .tableLoader(loader) + .setAll(properties) + .project(TableSchema.fromResolvedSchema(getProjectedSchema())) + .limit(limit) + .filters(filters) + .flinkConf(readableConfig) + .build(); + } + + private DataStream createFLIP27Stream(StreamExecutionEnvironment env) { + SplitAssignerType assignerType = + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_SPLIT_ASSIGNER_TYPE); + return IcebergSource.forRowData() + .tableLoader(loader) + .assignerFactory(assignerType.factory()) + .setAll(properties) + .project(getProjectedSchema()) + .limit(limit) + .filters(filters) + .flinkConfig(readableConfig) + .buildStream(env); + } + + private ResolvedSchema getProjectedSchema() { + if (projectedFields == null) { + return schema; + } else { + List fullColumns = schema.getColumns(); + return ResolvedSchema.of( + Arrays.stream(projectedFields).mapToObj(fullColumns::get).collect(Collectors.toList())); + } + } + + @Override + public void applyLimit(long newLimit) { + this.limit = newLimit; + } + + @Override + public Result applyFilters(List flinkFilters) { + List acceptedFilters = Lists.newArrayList(); + List expressions = Lists.newArrayList(); + + for (ResolvedExpression resolvedExpression : flinkFilters) { + Optional icebergExpression = FlinkFilters.convert(resolvedExpression); + if (icebergExpression.isPresent()) { + expressions.add(icebergExpression.get()); + acceptedFilters.add(resolvedExpression); + } + } + + this.filters = expressions; + return Result.of(acceptedFilters, flinkFilters); + } + + 
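// Note on pushdown semantics: only expressions that FlinkFilters can translate are handed to
+  // Iceberg scan planning, and Result.of(acceptedFilters, flinkFilters) keeps every original
+  // Flink predicate, so Flink re-applies all filters after the scan. A plain comparison such as
+  // id > 100 is typically convertible, while e.g. an arbitrary UDF call is not and is evaluated
+  // only on the Flink side.
+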
@Override + public void applySourceWatermark() { + Preconditions.checkArgument( + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE), + "Source watermarks are supported only in flip-27 iceberg source implementation"); + + Preconditions.checkNotNull( + properties.get(FlinkReadOptions.WATERMARK_COLUMN), + "watermark-column needs to be configured to use source watermark."); + } + + @Override + public boolean supportsNestedProjection() { + // TODO: support nested projection + return false; + } + + @Override + public ChangelogMode getChangelogMode() { + return ChangelogMode.insertOnly(); + } + + @Override + public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { + return new DataStreamScanProvider() { + @Override + public DataStream produceDataStream( + ProviderContext providerContext, StreamExecutionEnvironment execEnv) { + if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE)) { + return createFLIP27Stream(execEnv); + } else { + return createDataStream(execEnv); + } + } + + @Override + public boolean isBounded() { + return FlinkSource.isBounded(properties); + } + + @Override + public Optional getParallelism() { + return Optional.ofNullable( + PropertyUtil.propertyAsNullableInt(properties, FactoryUtil.SOURCE_PARALLELISM.key())); + } + }; + } + + @Override + public DynamicTableSource copy() { + return new IcebergTableSource(this); + } + + @Override + public String asSummaryString() { + return "Iceberg table source"; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java new file mode 100644 index 000000000000..bf6f72cc287a --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.List; +import java.util.Map; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.data.DeleteFilter; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.FlinkSourceFilter; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.data.FlinkOrcReader; +import org.apache.iceberg.flink.data.FlinkParquetReaders; +import org.apache.iceberg.flink.data.FlinkPlannedAvroReader; +import org.apache.iceberg.flink.data.RowDataProjection; +import org.apache.iceberg.flink.data.RowDataUtil; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.mapping.NameMappingParser; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.PartitionUtil; + +@Internal +public class RowDataFileScanTaskReader implements FileScanTaskReader { + + private final Schema tableSchema; + private final Schema projectedSchema; + private final String nameMapping; + private final boolean caseSensitive; + private final FlinkSourceFilter rowFilter; + + public RowDataFileScanTaskReader( + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + List filters) { + this.tableSchema = tableSchema; + this.projectedSchema = projectedSchema; + this.nameMapping = nameMapping; + this.caseSensitive = caseSensitive; + + if (filters != null && !filters.isEmpty()) { + Expression combinedExpression = + filters.stream().reduce(Expressions.alwaysTrue(), Expressions::and); + this.rowFilter = + new FlinkSourceFilter(this.projectedSchema, combinedExpression, this.caseSensitive); + } else { + this.rowFilter = null; + } + } + + @Override + public CloseableIterator open( + FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { + Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds()); + + Map idToConstant = + partitionSchema.columns().isEmpty() + ? ImmutableMap.of() + : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); + + FlinkDeleteFilter deletes = + new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); + CloseableIterable iterable = + deletes.filter( + newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor)); + + // Project the RowData to remove the extra meta columns. 
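+    // (deletes.requiredSchema() can include columns that were read only to evaluate the delete
+    // files, e.g. equality-delete fields, so rows may need to be projected back to projectedSchema)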
+ if (!projectedSchema.sameSchema(deletes.requiredSchema())) { + RowDataProjection rowDataProjection = + RowDataProjection.create( + deletes.requiredRowType(), + deletes.requiredSchema().asStruct(), + projectedSchema.asStruct()); + iterable = CloseableIterable.transform(iterable, rowDataProjection::wrap); + } + + return iterable.iterator(); + } + + private CloseableIterable newIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + CloseableIterable iter; + if (task.isDataTask()) { + throw new UnsupportedOperationException("Cannot read data task."); + } else { + switch (task.file().format()) { + case PARQUET: + iter = newParquetIterable(task, schema, idToConstant, inputFilesDecryptor); + break; + + case AVRO: + iter = newAvroIterable(task, schema, idToConstant, inputFilesDecryptor); + break; + + case ORC: + iter = newOrcIterable(task, schema, idToConstant, inputFilesDecryptor); + break; + + default: + throw new UnsupportedOperationException( + "Cannot read unknown format: " + task.file().format()); + } + } + + if (rowFilter != null) { + return CloseableIterable.filter(iter, rowFilter::filter); + } + return iter; + } + + private CloseableIterable newAvroIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Avro.ReadBuilder builder = + Avro.read(inputFilesDecryptor.getInputFile(task)) + .reuseContainers() + .project(schema) + .split(task.start(), task.length()) + .createReaderFunc(readSchema -> FlinkPlannedAvroReader.create(schema, idToConstant)); + + if (nameMapping != null) { + builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + return builder.build(); + } + + private CloseableIterable newParquetIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Parquet.ReadBuilder builder = + Parquet.read(inputFilesDecryptor.getInputFile(task)) + .split(task.start(), task.length()) + .project(schema) + .createReaderFunc( + fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive) + .reuseContainers(); + + if (nameMapping != null) { + builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + return builder.build(); + } + + private CloseableIterable newOrcIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + + ORC.ReadBuilder builder = + ORC.read(inputFilesDecryptor.getInputFile(task)) + .project(readSchemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createReaderFunc( + readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); + + if (nameMapping != null) { + builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + return builder.build(); + } + + private static class FlinkDeleteFilter extends DeleteFilter { + private final RowType requiredRowType; + private final RowDataWrapper asStructLike; + private final InputFilesDecryptor inputFilesDecryptor; + + FlinkDeleteFilter( + FileScanTask task, + Schema tableSchema, + Schema requestedSchema, + InputFilesDecryptor inputFilesDecryptor) { + super(task.file().location(), task.deletes(), tableSchema, requestedSchema); + 
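// requiredSchema() comes from DeleteFilter and may add columns on top of requestedSchema that
+      // are needed to apply the delete files, so the row type below is derived from it rather
+      // than from requestedSchema.
+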
this.requiredRowType = FlinkSchemaUtil.convert(requiredSchema()); + this.asStructLike = new RowDataWrapper(requiredRowType, requiredSchema().asStruct()); + this.inputFilesDecryptor = inputFilesDecryptor; + } + + public RowType requiredRowType() { + return requiredRowType; + } + + @Override + protected StructLike asStructLike(RowData row) { + return asStructLike.wrap(row); + } + + @Override + protected InputFile getInputFile(String location) { + return inputFilesDecryptor.getInputFile(location); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java new file mode 100644 index 000000000000..d27b2531eec0 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; + +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; +import org.apache.iceberg.flink.sink.TaskWriterFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.PropertyUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class RowDataRewriter { + + private static final Logger LOG = LoggerFactory.getLogger(RowDataRewriter.class); + + private final Schema schema; + private final String nameMapping; + private final FileIO io; + private final boolean caseSensitive; + private final EncryptionManager encryptionManager; + private final TaskWriterFactory taskWriterFactory; + private final String tableName; + + public RowDataRewriter( + Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) { + this.schema = table.schema(); + this.caseSensitive = 
caseSensitive; + this.io = io; + this.encryptionManager = encryptionManager; + this.nameMapping = + PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null); + this.tableName = table.name(); + + String formatString = + PropertyUtil.propertyAsString( + table.properties(), + TableProperties.DEFAULT_FILE_FORMAT, + TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + FileFormat format = FileFormat.fromString(formatString); + RowType flinkSchema = FlinkSchemaUtil.convert(table.schema()); + this.taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + flinkSchema, + Long.MAX_VALUE, + format, + table.properties(), + null, + false); + } + + public List rewriteDataForTasks( + DataStream dataStream, int parallelism) throws Exception { + RewriteMap map = + new RewriteMap( + schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory); + DataStream> ds = dataStream.map(map).setParallelism(parallelism); + return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream() + .flatMap(Collection::stream) + .collect(Collectors.toList()); + } + + public static class RewriteMap extends RichMapFunction> { + + private TaskWriter writer; + private int subTaskId; + private int attemptId; + + private final FileIO io; + private final EncryptionManager encryptionManager; + private final TaskWriterFactory taskWriterFactory; + private final RowDataFileScanTaskReader rowDataReader; + + public RewriteMap( + Schema schema, + String nameMapping, + FileIO io, + boolean caseSensitive, + EncryptionManager encryptionManager, + TaskWriterFactory taskWriterFactory) { + this.io = io; + this.encryptionManager = encryptionManager; + this.taskWriterFactory = taskWriterFactory; + this.rowDataReader = + new RowDataFileScanTaskReader( + schema, schema, nameMapping, caseSensitive, Collections.emptyList()); + } + + @Override + public void open(OpenContext parameters) { + this.subTaskId = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); + this.attemptId = getRuntimeContext().getTaskInfo().getAttemptNumber(); + // Initialize the task writer factory. + this.taskWriterFactory.initialize(subTaskId, attemptId); + } + + @Override + public List map(CombinedScanTask task) throws Exception { + // Initialize the task writer. 
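+      // A new writer is created for every CombinedScanTask; its data files are returned below,
+      // and the writer is aborted in the catch block if the rewrite of this task fails.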
+ this.writer = taskWriterFactory.create(); + try (DataIterator iterator = + new DataIterator<>(rowDataReader, task, io, encryptionManager)) { + while (iterator.hasNext()) { + RowData rowData = iterator.next(); + writer.write(rowData); + } + return Lists.newArrayList(writer.dataFiles()); + } catch (Throwable originalThrowable) { + try { + LOG.error("Aborting commit for (subTaskId {}, attemptId {})", subTaskId, attemptId); + writer.abort(); + LOG.error("Aborted commit for (subTaskId {}, attemptId {})", subTaskId, attemptId); + } catch (Throwable inner) { + if (originalThrowable != inner) { + originalThrowable.addSuppressed(inner); + LOG.warn("Suppressing exception in catch: {}", inner.getMessage(), inner); + } + } + + if (originalThrowable instanceof Exception) { + throw originalThrowable; + } else { + throw new RuntimeException(originalThrowable); + } + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java new file mode 100644 index 000000000000..8ef1f1fbb833 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.Serializable; +import java.util.function.Function; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.annotation.Internal; +import org.apache.flink.formats.avro.RowDataToAvroConverters; +import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.flink.FlinkSchemaUtil; + +/** + * This is not serializable because Avro {@link Schema} is not actually serializable, even though it + * implements {@link Serializable} interface. 
+ */ +@Internal +public class RowDataToAvroGenericRecordConverter implements Function { + private final RowDataToAvroConverters.RowDataToAvroConverter converter; + private final Schema avroSchema; + + private RowDataToAvroGenericRecordConverter(RowType rowType, Schema avroSchema) { + this.converter = RowDataToAvroConverters.createConverter(rowType); + this.avroSchema = avroSchema; + } + + @Override + public GenericRecord apply(RowData rowData) { + return (GenericRecord) converter.convert(avroSchema, rowData); + } + + /** Create a converter based on Iceberg schema */ + public static RowDataToAvroGenericRecordConverter fromIcebergSchema( + String tableName, org.apache.iceberg.Schema icebergSchema) { + RowType rowType = FlinkSchemaUtil.convert(icebergSchema); + Schema avroSchema = AvroSchemaUtil.convert(icebergSchema, tableName); + return new RowDataToAvroGenericRecordConverter(rowType, avroSchema); + } + + /** Create a mapper based on Avro schema */ + public static RowDataToAvroGenericRecordConverter fromAvroSchema(Schema avroSchema) { + DataType dataType = AvroSchemaConverter.convertToDataType(avroSchema.toString()); + LogicalType logicalType = TypeConversions.fromDataToLogicalType(dataType); + RowType rowType = RowType.of(logicalType.getChildren().toArray(new LogicalType[0])); + return new RowDataToAvroGenericRecordConverter(rowType, avroSchema); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java new file mode 100644 index 000000000000..bac7c05bdfef --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -0,0 +1,597 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.Serializable; +import java.time.Duration; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.apache.flink.annotation.Internal; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.util.TimeUtils; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkReadConf; +import org.apache.iceberg.flink.FlinkReadOptions; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** Context object with optional arguments for a Flink Scan. 
*/ +@Internal +public class ScanContext implements Serializable { + + private static final long serialVersionUID = 1L; + + private final boolean caseSensitive; + private final boolean exposeLocality; + private final Long snapshotId; + private final String branch; + private final String tag; + private final StreamingStartingStrategy startingStrategy; + private final Long startSnapshotId; + private final Long startSnapshotTimestamp; + private final Long endSnapshotId; + private final Long asOfTimestamp; + private final String startTag; + private final String endTag; + private final Long splitSize; + private final Integer splitLookback; + private final Long splitOpenFileCost; + private final boolean isStreaming; + private final Duration monitorInterval; + + private final String nameMapping; + private final Schema schema; + private final List filters; + private final long limit; + private final boolean includeColumnStats; + private final Collection includeStatsForColumns; + private final Integer planParallelism; + private final int maxPlanningSnapshotCount; + private final int maxAllowedPlanningFailures; + private final String watermarkColumn; + private final TimeUnit watermarkColumnTimeUnit; + + private ScanContext( + boolean caseSensitive, + Long snapshotId, + StreamingStartingStrategy startingStrategy, + Long startSnapshotTimestamp, + Long startSnapshotId, + Long endSnapshotId, + Long asOfTimestamp, + Long splitSize, + Integer splitLookback, + Long splitOpenFileCost, + boolean isStreaming, + Duration monitorInterval, + String nameMapping, + Schema schema, + List filters, + long limit, + boolean includeColumnStats, + Collection includeStatsForColumns, + boolean exposeLocality, + Integer planParallelism, + int maxPlanningSnapshotCount, + int maxAllowedPlanningFailures, + String watermarkColumn, + TimeUnit watermarkColumnTimeUnit, + String branch, + String tag, + String startTag, + String endTag) { + this.caseSensitive = caseSensitive; + this.snapshotId = snapshotId; + this.tag = tag; + this.branch = branch; + this.startingStrategy = startingStrategy; + this.startSnapshotTimestamp = startSnapshotTimestamp; + this.startSnapshotId = startSnapshotId; + this.endSnapshotId = endSnapshotId; + this.asOfTimestamp = asOfTimestamp; + this.startTag = startTag; + this.endTag = endTag; + this.splitSize = splitSize; + this.splitLookback = splitLookback; + this.splitOpenFileCost = splitOpenFileCost; + this.isStreaming = isStreaming; + this.monitorInterval = monitorInterval; + + this.nameMapping = nameMapping; + this.schema = schema; + this.filters = filters; + this.limit = limit; + this.includeColumnStats = includeColumnStats; + this.includeStatsForColumns = includeStatsForColumns; + this.exposeLocality = exposeLocality; + this.planParallelism = planParallelism; + this.maxPlanningSnapshotCount = maxPlanningSnapshotCount; + this.maxAllowedPlanningFailures = maxAllowedPlanningFailures; + this.watermarkColumn = watermarkColumn; + this.watermarkColumnTimeUnit = watermarkColumnTimeUnit; + } + + void validate() { + if (isStreaming) { + if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { + Preconditions.checkArgument( + startSnapshotId != null, + "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + Preconditions.checkArgument( + startSnapshotTimestamp == null, + "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + } + if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { + 
Preconditions.checkArgument( + startSnapshotTimestamp != null, + "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + Preconditions.checkArgument( + startSnapshotId == null, + "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + } + + Preconditions.checkArgument( + tag == null, + String.format("Cannot scan table using ref %s configured for streaming reader", tag)); + Preconditions.checkArgument( + snapshotId == null, "Cannot set snapshot-id option for streaming reader"); + Preconditions.checkArgument( + asOfTimestamp == null, "Cannot set as-of-timestamp option for streaming reader"); + Preconditions.checkArgument( + endSnapshotId == null, "Cannot set end-snapshot-id option for streaming reader"); + Preconditions.checkArgument(endTag == null, "Cannot set end-tag option for streaming reader"); + } + + Preconditions.checkArgument( + !(startTag != null && startSnapshotId() != null), + "START_SNAPSHOT_ID and START_TAG cannot both be set."); + + Preconditions.checkArgument( + !(endTag != null && endSnapshotId() != null), + "END_SNAPSHOT_ID and END_TAG cannot both be set."); + + Preconditions.checkArgument( + maxAllowedPlanningFailures >= -1, + "Cannot set maxAllowedPlanningFailures to a negative number other than -1."); + } + + public boolean caseSensitive() { + return caseSensitive; + } + + public Long snapshotId() { + return snapshotId; + } + + public String branch() { + return branch; + } + + public String tag() { + return tag; + } + + public String startTag() { + return startTag; + } + + public String endTag() { + return endTag; + } + + public StreamingStartingStrategy streamingStartingStrategy() { + return startingStrategy; + } + + public Long startSnapshotTimestamp() { + return startSnapshotTimestamp; + } + + public Long startSnapshotId() { + return startSnapshotId; + } + + public Long endSnapshotId() { + return endSnapshotId; + } + + public Long asOfTimestamp() { + return asOfTimestamp; + } + + public Long splitSize() { + return splitSize; + } + + public Integer splitLookback() { + return splitLookback; + } + + public Long splitOpenFileCost() { + return splitOpenFileCost; + } + + public boolean isStreaming() { + return isStreaming; + } + + public Duration monitorInterval() { + return monitorInterval; + } + + public String nameMapping() { + return nameMapping; + } + + public Schema project() { + return schema; + } + + public List filters() { + return filters; + } + + public long limit() { + return limit; + } + + public boolean includeColumnStats() { + return includeColumnStats; + } + + public Collection includeStatsForColumns() { + return includeStatsForColumns; + } + + public boolean exposeLocality() { + return exposeLocality; + } + + public Integer planParallelism() { + return planParallelism; + } + + public int maxPlanningSnapshotCount() { + return maxPlanningSnapshotCount; + } + + public int maxAllowedPlanningFailures() { + return maxAllowedPlanningFailures; + } + + public String watermarkColumn() { + return watermarkColumn; + } + + public TimeUnit watermarkColumnTimeUnit() { + return watermarkColumnTimeUnit; + } + + public ScanContext copyWithAppendsBetween(Long newStartSnapshotId, long newEndSnapshotId) { + return ScanContext.builder() + .caseSensitive(caseSensitive) + .useSnapshotId(null) + .useBranch(branch) + .useTag(null) + .startSnapshotId(newStartSnapshotId) + .endSnapshotId(newEndSnapshotId) + .startTag(null) + .endTag(null) + .asOfTimestamp(null) + .splitSize(splitSize) + .splitLookback(splitLookback) + 
.splitOpenFileCost(splitOpenFileCost) + .streaming(isStreaming) + .monitorInterval(monitorInterval) + .nameMapping(nameMapping) + .project(schema) + .filters(filters) + .limit(limit) + .includeColumnStats(includeColumnStats) + .includeColumnStats(includeStatsForColumns) + .exposeLocality(exposeLocality) + .planParallelism(planParallelism) + .maxPlanningSnapshotCount(maxPlanningSnapshotCount) + .maxAllowedPlanningFailures(maxAllowedPlanningFailures) + .watermarkColumn(watermarkColumn) + .watermarkColumnTimeUnit(watermarkColumnTimeUnit) + .build(); + } + + public ScanContext copyWithSnapshotId(long newSnapshotId) { + return ScanContext.builder() + .caseSensitive(caseSensitive) + .useSnapshotId(newSnapshotId) + .useBranch(branch) + .useTag(tag) + .startSnapshotId(null) + .endSnapshotId(null) + .startTag(null) + .endTag(null) + .asOfTimestamp(null) + .splitSize(splitSize) + .splitLookback(splitLookback) + .splitOpenFileCost(splitOpenFileCost) + .streaming(isStreaming) + .monitorInterval(monitorInterval) + .nameMapping(nameMapping) + .project(schema) + .filters(filters) + .limit(limit) + .includeColumnStats(includeColumnStats) + .includeColumnStats(includeStatsForColumns) + .exposeLocality(exposeLocality) + .planParallelism(planParallelism) + .maxPlanningSnapshotCount(maxPlanningSnapshotCount) + .maxAllowedPlanningFailures(maxAllowedPlanningFailures) + .watermarkColumn(watermarkColumn) + .watermarkColumnTimeUnit(watermarkColumnTimeUnit) + .build(); + } + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + private boolean caseSensitive = FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue(); + private Long snapshotId = FlinkReadOptions.SNAPSHOT_ID.defaultValue(); + private String branch = FlinkReadOptions.BRANCH.defaultValue(); + private String tag = FlinkReadOptions.TAG.defaultValue(); + private String startTag = FlinkReadOptions.START_TAG.defaultValue(); + private String endTag = FlinkReadOptions.END_TAG.defaultValue(); + private StreamingStartingStrategy startingStrategy = + FlinkReadOptions.STARTING_STRATEGY_OPTION.defaultValue(); + private Long startSnapshotTimestamp = FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.defaultValue(); + private Long startSnapshotId = FlinkReadOptions.START_SNAPSHOT_ID.defaultValue(); + private Long endSnapshotId = FlinkReadOptions.END_SNAPSHOT_ID.defaultValue(); + private Long asOfTimestamp = FlinkReadOptions.AS_OF_TIMESTAMP.defaultValue(); + private Long splitSize = FlinkReadOptions.SPLIT_SIZE_OPTION.defaultValue(); + private Integer splitLookback = FlinkReadOptions.SPLIT_LOOKBACK_OPTION.defaultValue(); + private Long splitOpenFileCost = FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION.defaultValue(); + private boolean isStreaming = FlinkReadOptions.STREAMING_OPTION.defaultValue(); + private Duration monitorInterval = + TimeUtils.parseDuration(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue()); + private String nameMapping; + private Schema projectedSchema; + private List filters; + private long limit = FlinkReadOptions.LIMIT_OPTION.defaultValue(); + private boolean includeColumnStats = + FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue(); + private Collection includeStatsForColumns = null; + private boolean exposeLocality; + private Integer planParallelism = + FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue(); + private int maxPlanningSnapshotCount = + FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION.defaultValue(); + private int maxAllowedPlanningFailures = + 
FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue(); + private String watermarkColumn = FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue(); + private TimeUnit watermarkColumnTimeUnit = + FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue(); + + private Builder() {} + + public Builder caseSensitive(boolean newCaseSensitive) { + this.caseSensitive = newCaseSensitive; + return this; + } + + public Builder useSnapshotId(Long newSnapshotId) { + this.snapshotId = newSnapshotId; + return this; + } + + public Builder useTag(String newTag) { + this.tag = newTag; + return this; + } + + public Builder useBranch(String newBranch) { + this.branch = newBranch; + return this; + } + + public Builder startingStrategy(StreamingStartingStrategy newStartingStrategy) { + this.startingStrategy = newStartingStrategy; + return this; + } + + public Builder startSnapshotTimestamp(Long newStartSnapshotTimestamp) { + this.startSnapshotTimestamp = newStartSnapshotTimestamp; + return this; + } + + public Builder startSnapshotId(Long newStartSnapshotId) { + this.startSnapshotId = newStartSnapshotId; + return this; + } + + public Builder endSnapshotId(Long newEndSnapshotId) { + this.endSnapshotId = newEndSnapshotId; + return this; + } + + public Builder startTag(String newStartTag) { + this.startTag = newStartTag; + return this; + } + + public Builder endTag(String newEndTag) { + this.endTag = newEndTag; + return this; + } + + public Builder asOfTimestamp(Long newAsOfTimestamp) { + this.asOfTimestamp = newAsOfTimestamp; + return this; + } + + public Builder splitSize(Long newSplitSize) { + this.splitSize = newSplitSize; + return this; + } + + public Builder splitLookback(Integer newSplitLookback) { + this.splitLookback = newSplitLookback; + return this; + } + + public Builder splitOpenFileCost(Long newSplitOpenFileCost) { + this.splitOpenFileCost = newSplitOpenFileCost; + return this; + } + + public Builder streaming(boolean streaming) { + this.isStreaming = streaming; + return this; + } + + public Builder monitorInterval(Duration newMonitorInterval) { + this.monitorInterval = newMonitorInterval; + return this; + } + + public Builder nameMapping(String newNameMapping) { + this.nameMapping = newNameMapping; + return this; + } + + public Builder project(Schema newProjectedSchema) { + this.projectedSchema = newProjectedSchema; + return this; + } + + public Builder filters(List newFilters) { + this.filters = newFilters; + return this; + } + + public Builder limit(long newLimit) { + this.limit = newLimit; + return this; + } + + public Builder includeColumnStats(boolean newIncludeColumnStats) { + this.includeColumnStats = newIncludeColumnStats; + return this; + } + + public Builder includeColumnStats(Collection newIncludeStatsForColumns) { + this.includeStatsForColumns = newIncludeStatsForColumns; + return this; + } + + public Builder exposeLocality(boolean newExposeLocality) { + this.exposeLocality = newExposeLocality; + return this; + } + + public Builder planParallelism(Integer parallelism) { + this.planParallelism = parallelism; + return this; + } + + public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { + this.maxPlanningSnapshotCount = newMaxPlanningSnapshotCount; + return this; + } + + public Builder maxAllowedPlanningFailures(int newMaxAllowedPlanningFailures) { + this.maxAllowedPlanningFailures = newMaxAllowedPlanningFailures; + return this; + } + + public Builder watermarkColumn(String newWatermarkColumn) { + this.watermarkColumn = newWatermarkColumn; + return 
this; + } + + public Builder watermarkColumnTimeUnit(TimeUnit newWatermarkTimeUnit) { + this.watermarkColumnTimeUnit = newWatermarkTimeUnit; + return this; + } + + public Builder resolveConfig( + Table table, Map readOptions, ReadableConfig readableConfig) { + FlinkReadConf flinkReadConf = new FlinkReadConf(table, readOptions, readableConfig); + + return this.useSnapshotId(flinkReadConf.snapshotId()) + .useTag(flinkReadConf.tag()) + .useBranch(flinkReadConf.branch()) + .startTag(flinkReadConf.startTag()) + .endTag(flinkReadConf.endTag()) + .caseSensitive(flinkReadConf.caseSensitive()) + .asOfTimestamp(flinkReadConf.asOfTimestamp()) + .startingStrategy(flinkReadConf.startingStrategy()) + .startSnapshotTimestamp(flinkReadConf.startSnapshotTimestamp()) + .startSnapshotId(flinkReadConf.startSnapshotId()) + .endSnapshotId(flinkReadConf.endSnapshotId()) + .splitSize(flinkReadConf.splitSize()) + .splitLookback(flinkReadConf.splitLookback()) + .splitOpenFileCost(flinkReadConf.splitFileOpenCost()) + .streaming(flinkReadConf.streaming()) + .monitorInterval(flinkReadConf.monitorInterval()) + .nameMapping(flinkReadConf.nameMapping()) + .limit(flinkReadConf.limit()) + .planParallelism(flinkReadConf.workerPoolSize()) + .includeColumnStats(flinkReadConf.includeColumnStats()) + .maxPlanningSnapshotCount(flinkReadConf.maxPlanningSnapshotCount()) + .maxAllowedPlanningFailures(flinkReadConf.maxAllowedPlanningFailures()) + .watermarkColumn(flinkReadConf.watermarkColumn()) + .watermarkColumnTimeUnit(flinkReadConf.watermarkColumnTimeUnit()); + } + + public ScanContext build() { + return new ScanContext( + caseSensitive, + snapshotId, + startingStrategy, + startSnapshotTimestamp, + startSnapshotId, + endSnapshotId, + asOfTimestamp, + splitSize, + splitLookback, + splitOpenFileCost, + isStreaming, + monitorInterval, + nameMapping, + projectedSchema, + filters, + limit, + includeColumnStats, + includeStatsForColumns, + exposeLocality, + planParallelism, + maxPlanningSnapshotCount, + maxAllowedPlanningFailures, + watermarkColumn, + watermarkColumnTimeUnit, + branch, + tag, + startTag, + endTag); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java new file mode 100644 index 000000000000..7c3a69dbc141 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.function.Supplier; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.api.config.ExecutionConfigOptions; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.hadoop.Util; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class SourceUtil { + private SourceUtil() {} + + static boolean isLocalityEnabled( + Table table, ReadableConfig readableConfig, Boolean exposeLocality) { + Boolean localityEnabled = + exposeLocality != null + ? exposeLocality + : readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO); + + if (localityEnabled != null && !localityEnabled) { + return false; + } + + return Util.mayHaveBlockLocations(table.io(), table.location()); + } + + /** + * Infer source parallelism. + * + * @param readableConfig Flink config. + * @param splitCountProvider Split count supplier. As the computation may involve expensive split + * discover, lazy evaluation is performed if inferring parallelism is enabled. + * @param limitCount limited output count. + */ + static int inferParallelism( + ReadableConfig readableConfig, long limitCount, Supplier splitCountProvider) { + int parallelism = + readableConfig.get(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM); + if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM)) { + int maxInferParallelism = + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX); + Preconditions.checkState( + maxInferParallelism >= 1, + FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX.key() + + " cannot be less than 1"); + parallelism = Math.min(splitCountProvider.get(), maxInferParallelism); + } + + if (limitCount > 0) { + int limit = limitCount >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) limitCount; + parallelism = Math.min(parallelism, limit); + } + + // parallelism must be positive. + parallelism = Math.max(1, parallelism); + return parallelism; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java new file mode 100644 index 000000000000..133859b657e5 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java @@ -0,0 +1,269 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; +import java.util.concurrent.ExecutorService; +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeutils.base.LongSerializer; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.FunctionSnapshotContext; +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; +import org.apache.flink.streaming.api.functions.source.legacy.RichSourceFunction; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This is the single (non-parallel) monitoring task which takes a {@link FlinkInputFormat}, it is + * responsible for: + * + *

+ * <ol>
+ *   <li>Monitoring snapshots of the Iceberg table.
+ *   <li>Creating the {@link FlinkInputSplit splits} corresponding to the incremental files
+ *   <li>Assigning them to downstream tasks for further processing.
+ * </ol>
+ *
+ * <p>
    The splits to be read are forwarded to the downstream {@link StreamingReaderOperator} which + * can have parallelism greater than one. + */ +public class StreamingMonitorFunction extends RichSourceFunction + implements CheckpointedFunction { + + private static final Logger LOG = LoggerFactory.getLogger(StreamingMonitorFunction.class); + + private static final long INIT_LAST_SNAPSHOT_ID = -1L; + + private final TableLoader tableLoader; + private final ScanContext scanContext; + + private volatile boolean isRunning = true; + + // The checkpoint thread is not the same thread that running the function for SourceStreamTask + // now. It's necessary to + // mark this as volatile. + private volatile long lastSnapshotId = INIT_LAST_SNAPSHOT_ID; + + private transient SourceContext sourceContext; + private transient Table table; + private transient ListState lastSnapshotIdState; + private transient ExecutorService workerPool; + + public StreamingMonitorFunction(TableLoader tableLoader, ScanContext scanContext) { + Preconditions.checkArgument( + scanContext.snapshotId() == null, "Cannot set snapshot-id option for streaming reader"); + Preconditions.checkArgument( + scanContext.asOfTimestamp() == null, + "Cannot set as-of-timestamp option for streaming reader"); + Preconditions.checkArgument( + scanContext.endSnapshotId() == null, + "Cannot set end-snapshot-id option for streaming reader"); + Preconditions.checkArgument( + scanContext.endTag() == null, "Cannot set end-tag option for streaming reader"); + Preconditions.checkArgument( + scanContext.maxPlanningSnapshotCount() > 0, + "The max-planning-snapshot-count must be greater than zero"); + this.tableLoader = tableLoader; + this.scanContext = scanContext; + } + + @Override + public void open(OpenContext parameters) throws Exception { + super.open(parameters); + + final RuntimeContext runtimeContext = getRuntimeContext(); + ValidationException.check( + runtimeContext instanceof StreamingRuntimeContext, + "context should be instance of StreamingRuntimeContext"); + final String operatorID = ((StreamingRuntimeContext) runtimeContext).getOperatorUniqueID(); + this.workerPool = + ThreadPools.newFixedThreadPool( + "iceberg-worker-pool-" + operatorID, scanContext.planParallelism()); + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + // Load iceberg table from table loader. + tableLoader.open(); + table = tableLoader.loadTable(); + + // Initialize the flink state for last snapshot id. + lastSnapshotIdState = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("snapshot-id-state", LongSerializer.INSTANCE)); + + // Restore the last-snapshot-id from flink's state if possible. 
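+    // If restoring from a checkpoint, resume from the persisted snapshot id; otherwise resolve the
+    // configured start tag or start snapshot id (validated below to be an ancestor of the current
+    // snapshot) and use it as the exclusive lower bound for incremental split planning.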
+ if (context.isRestored()) { + LOG.info("Restoring state for the {}.", getClass().getSimpleName()); + lastSnapshotId = lastSnapshotIdState.get().iterator().next(); + } else if (scanContext.startTag() != null || scanContext.startSnapshotId() != null) { + Preconditions.checkArgument( + !(scanContext.startTag() != null && scanContext.startSnapshotId() != null), + "START_SNAPSHOT_ID and START_TAG cannot both be set."); + Preconditions.checkNotNull( + table.currentSnapshot(), "Don't have any available snapshot in table."); + + long startSnapshotId; + if (scanContext.startTag() != null) { + Preconditions.checkArgument( + table.snapshot(scanContext.startTag()) != null, + "Cannot find snapshot with tag %s in table.", + scanContext.startTag()); + startSnapshotId = table.snapshot(scanContext.startTag()).snapshotId(); + } else { + startSnapshotId = scanContext.startSnapshotId(); + } + + long currentSnapshotId = table.currentSnapshot().snapshotId(); + Preconditions.checkState( + SnapshotUtil.isAncestorOf(table, currentSnapshotId, startSnapshotId), + "The option start-snapshot-id %s is not an ancestor of the current snapshot.", + startSnapshotId); + + lastSnapshotId = startSnapshotId; + } + } + + @Override + public void snapshotState(FunctionSnapshotContext context) throws Exception { + lastSnapshotIdState.clear(); + lastSnapshotIdState.add(lastSnapshotId); + } + + @Override + public void run(SourceContext ctx) throws Exception { + this.sourceContext = ctx; + while (isRunning) { + monitorAndForwardSplits(); + Thread.sleep(scanContext.monitorInterval().toMillis()); + } + } + + private long toSnapshotIdInclusive( + long lastConsumedSnapshotId, long currentSnapshotId, int maxPlanningSnapshotCount) { + List snapshotIds = + SnapshotUtil.snapshotIdsBetween(table, lastConsumedSnapshotId, currentSnapshotId); + if (snapshotIds.size() <= maxPlanningSnapshotCount) { + return currentSnapshotId; + } else { + // It uses reverted index since snapshotIdsBetween returns Ids that are ordered by committed + // time descending. + return snapshotIds.get(snapshotIds.size() - maxPlanningSnapshotCount); + } + } + + @VisibleForTesting + void sourceContext(SourceContext ctx) { + this.sourceContext = ctx; + } + + @VisibleForTesting + void monitorAndForwardSplits() { + // Refresh the table to get the latest committed snapshot. + table.refresh(); + + Snapshot snapshot = + scanContext.branch() != null + ? 
table.snapshot(scanContext.branch()) + : table.currentSnapshot(); + if (snapshot != null && snapshot.snapshotId() != lastSnapshotId) { + long snapshotId = snapshot.snapshotId(); + + ScanContext newScanContext; + if (lastSnapshotId == INIT_LAST_SNAPSHOT_ID) { + newScanContext = scanContext.copyWithSnapshotId(snapshotId); + } else { + snapshotId = + toSnapshotIdInclusive( + lastSnapshotId, snapshotId, scanContext.maxPlanningSnapshotCount()); + newScanContext = scanContext.copyWithAppendsBetween(lastSnapshotId, snapshotId); + } + + LOG.debug( + "Start discovering splits from {} (exclusive) to {} (inclusive)", + lastSnapshotId, + snapshotId); + long start = System.currentTimeMillis(); + FlinkInputSplit[] splits = + FlinkSplitPlanner.planInputSplits(table, newScanContext, workerPool); + LOG.debug( + "Discovered {} splits, time elapsed {}ms", + splits.length, + System.currentTimeMillis() - start); + + // only need to hold the checkpoint lock when emitting the splits and updating lastSnapshotId + start = System.currentTimeMillis(); + synchronized (sourceContext.getCheckpointLock()) { + for (FlinkInputSplit split : splits) { + sourceContext.collect(split); + } + + lastSnapshotId = snapshotId; + } + LOG.debug( + "Forwarded {} splits, time elapsed {}ms", + splits.length, + System.currentTimeMillis() - start); + } + } + + @Override + public void cancel() { + // this is to cover the case where cancel() is called before the run() + if (sourceContext != null) { + synchronized (sourceContext.getCheckpointLock()) { + isRunning = false; + } + } else { + isRunning = false; + } + + // Release all the resources here. + if (tableLoader != null) { + try { + tableLoader.close(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } + + @Override + public void close() { + cancel(); + + if (workerPool != null) { + workerPool.shutdown(); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java new file mode 100644 index 000000000000..6cc2ccd2c353 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.Queue; +import org.apache.flink.api.common.operators.MailboxExecutor; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.runtime.state.JavaSerializer; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.functions.source.legacy.SourceFunction; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.api.operators.StreamSourceContexts; +import org.apache.flink.streaming.api.operators.legacy.YieldingOperatorFactory; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * The operator that reads the {@link FlinkInputSplit splits} received from the preceding {@link + * StreamingMonitorFunction}. Contrary to the {@link StreamingMonitorFunction} which has a + * parallelism of 1, this operator can have multiple parallelism. + * + *

    As soon as a split descriptor is received, it is put in a queue, and use {@link + * MailboxExecutor} read the actual data of the split. This architecture allows the separation of + * the reading thread from the one split processing the checkpoint barriers, thus removing any + * potential back-pressure. + */ +public class StreamingReaderOperator extends AbstractStreamOperator + implements OneInputStreamOperator { + + private static final Logger LOG = LoggerFactory.getLogger(StreamingReaderOperator.class); + + // It's the same thread that is running this operator and checkpoint actions. we use this executor + // to schedule only + // one split for future reading, so that a new checkpoint could be triggered without blocking long + // time for exhausting + // all scheduled splits. + private final MailboxExecutor executor; + private FlinkInputFormat format; + + private transient SourceFunction.SourceContext sourceContext; + + private transient ListState inputSplitsState; + private transient Queue splits; + + // Splits are read by the same thread that calls processElement. Each read task is submitted to + // that thread by adding + // them to the executor. This state is used to ensure that only one read task is in that queue at + // a time, so that read + // tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this + // is set to RUNNING. + // When there are no more files to read, this will be set to IDLE. + private transient SplitState currentSplitState; + + private StreamingReaderOperator( + StreamOperatorParameters parameters, + FlinkInputFormat format, + ProcessingTimeService timeService, + MailboxExecutor mailboxExecutor) { + super(parameters); + this.format = Preconditions.checkNotNull(format, "The InputFormat should not be null."); + this.processingTimeService = timeService; + this.executor = + Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); + } + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + super.initializeState(context); + + // TODO Replace Java serialization with Avro approach to keep state compatibility. + // See issue: https://github.com/apache/iceberg/issues/1698 + inputSplitsState = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("splits", new JavaSerializer<>())); + + // Initialize the current split state to IDLE. + currentSplitState = SplitState.IDLE; + + // Recover splits state from flink state backend if possible. + splits = Lists.newLinkedList(); + if (context.isRestored()) { + int subtaskIdx = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); + LOG.info("Restoring state for the {} (taskIdx: {}).", getClass().getSimpleName(), subtaskIdx); + + for (FlinkInputSplit split : inputSplitsState.get()) { + splits.add(split); + } + } + + this.sourceContext = + StreamSourceContexts.getSourceContext( + getProcessingTimeService(), + new Object(), // no actual locking needed + output, + getExecutionConfig().getAutoWatermarkInterval(), + -1, + true); + + // Enqueue to process the recovered input splits. 
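+    // No-op when no splits were restored: enqueueProcessSplits only schedules a read task while
+    // the split queue is non-empty and no other read task is queued (currentSplitState == IDLE).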
+ enqueueProcessSplits(); + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + super.snapshotState(context); + + inputSplitsState.clear(); + inputSplitsState.addAll(Lists.newArrayList(splits)); + } + + @Override + public void processElement(StreamRecord element) { + splits.add(element.getValue()); + enqueueProcessSplits(); + } + + private void enqueueProcessSplits() { + if (currentSplitState == SplitState.IDLE && !splits.isEmpty()) { + currentSplitState = SplitState.RUNNING; + executor.execute(this::processSplits, this.getClass().getSimpleName()); + } + } + + private void processSplits() throws IOException { + FlinkInputSplit split = splits.poll(); + if (split == null) { + currentSplitState = SplitState.IDLE; + return; + } + + format.open(split); + try { + RowData nextElement = null; + while (!format.reachedEnd()) { + nextElement = format.nextRecord(nextElement); + sourceContext.collect(nextElement); + } + } finally { + currentSplitState = SplitState.IDLE; + format.close(); + } + + // Re-schedule to process the next split. + enqueueProcessSplits(); + } + + @Override + public void processWatermark(Watermark mark) { + // we do nothing because we emit our own watermarks if needed. + } + + @Override + public void close() throws Exception { + super.close(); + + if (format != null) { + format.close(); + format.closeInputFormat(); + format = null; + } + + sourceContext = null; + } + + @Override + public void finish() throws Exception { + super.finish(); + output.close(); + if (sourceContext != null) { + sourceContext.emitWatermark(Watermark.MAX_WATERMARK); + sourceContext.close(); + sourceContext = null; + } + } + + static OneInputStreamOperatorFactory factory(FlinkInputFormat format) { + return new OperatorFactory(format); + } + + private enum SplitState { + IDLE, + RUNNING + } + + private static class OperatorFactory extends AbstractStreamOperatorFactory + implements YieldingOperatorFactory, + OneInputStreamOperatorFactory { + + private final FlinkInputFormat format; + + private transient MailboxExecutor mailboxExecutor; + + private OperatorFactory(FlinkInputFormat format) { + this.format = format; + } + + @Override + public void setMailboxExecutor(MailboxExecutor mailboxExecutor) { + this.mailboxExecutor = mailboxExecutor; + } + + @SuppressWarnings("unchecked") + @Override + public > O createStreamOperator( + StreamOperatorParameters parameters) { + StreamingReaderOperator operator = + new StreamingReaderOperator(parameters, format, processingTimeService, mailboxExecutor); + return (O) operator; + } + + @Override + public Class getStreamOperatorClass(ClassLoader classLoader) { + return StreamingReaderOperator.class; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java new file mode 100644 index 000000000000..fbeaace20934 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +/** Starting strategy for streaming execution. */ +public enum StreamingStartingStrategy { + /** + * Do a regular table scan then switch to the incremental mode. + * + *

    The incremental mode starts from the current snapshot exclusive. + */ + TABLE_SCAN_THEN_INCREMENTAL, + + /** + * Start incremental mode from the latest snapshot inclusive. + * + *

    If it is an empty table, all future append snapshots should be discovered. + */ + INCREMENTAL_FROM_LATEST_SNAPSHOT, + + /** + * Start incremental mode from the latest snapshot exclusive. + * + *

    If it is an empty table, all future append snapshots should be discovered. + */ + INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE, + + /** + * Start incremental mode from the earliest snapshot inclusive. + * + *

    If it is an empty table, all future append snapshots should be discovered. + */ + INCREMENTAL_FROM_EARLIEST_SNAPSHOT, + + /** Start incremental mode from a snapshot with a specific id inclusive. */ + INCREMENTAL_FROM_SNAPSHOT_ID, + + /** + * Start incremental mode from a snapshot with a specific timestamp inclusive. + * + *

    If the timestamp is between two snapshots, it should start from the snapshot after the + * timestamp. + */ + INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java new file mode 100644 index 000000000000..e7447d08c985 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import java.util.ArrayDeque; +import java.util.Collection; +import java.util.PriorityQueue; +import java.util.Queue; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; +import org.apache.iceberg.flink.source.split.SerializableComparator; + +/** + * Since all methods are called in the source coordinator thread by enumerator, there is no need for + * locking. + */ +@Internal +public class DefaultSplitAssigner implements SplitAssigner { + + private final Queue pendingSplits; + private CompletableFuture availableFuture; + + public DefaultSplitAssigner(SerializableComparator comparator) { + this.pendingSplits = comparator == null ? new ArrayDeque<>() : new PriorityQueue<>(comparator); + } + + public DefaultSplitAssigner( + SerializableComparator comparator, + Collection assignerState) { + this(comparator); + // Because default assigner only tracks unassigned splits, + // there is no need to filter splits based on status (unassigned) here. 
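+    // Restored splits are re-queued directly; ordering is re-established by the comparator-backed
+    // priority queue when a comparator was supplied.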
+ assignerState.forEach(splitState -> pendingSplits.add(splitState.split())); + } + + @Override + public synchronized GetSplitResult getNext(@Nullable String hostname) { + if (pendingSplits.isEmpty()) { + return GetSplitResult.unavailable(); + } else { + IcebergSourceSplit split = pendingSplits.poll(); + return GetSplitResult.forSplit(split); + } + } + + @Override + public void onDiscoveredSplits(Collection splits) { + addSplits(splits); + } + + @Override + public void onUnassignedSplits(Collection splits) { + addSplits(splits); + } + + private synchronized void addSplits(Collection splits) { + if (!splits.isEmpty()) { + pendingSplits.addAll(splits); + // only complete pending future if new splits are discovered + completeAvailableFuturesIfNeeded(); + } + } + + /** Simple assigner only tracks unassigned splits */ + @Override + public synchronized Collection state() { + return pendingSplits.stream() + .map(split -> new IcebergSourceSplitState(split, IcebergSourceSplitStatus.UNASSIGNED)) + .collect(Collectors.toList()); + } + + @Override + public synchronized CompletableFuture isAvailable() { + if (availableFuture == null) { + availableFuture = new CompletableFuture<>(); + } + return availableFuture; + } + + @Override + public synchronized int pendingSplitCount() { + return pendingSplits.size(); + } + + @Override + public long pendingRecords() { + return pendingSplits.stream() + .map(split -> split.task().estimatedRowsCount()) + .reduce(0L, Long::sum); + } + + private synchronized void completeAvailableFuturesIfNeeded() { + if (availableFuture != null && !pendingSplits.isEmpty()) { + availableFuture.complete(null); + } + availableFuture = null; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java new file mode 100644 index 000000000000..36552782b6c1 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +@Internal +public class GetSplitResult { + + public enum Status { + AVAILABLE, + + /** + * There are pending splits. But they can't be assigned due to constraints (like event time + * alignment) + */ + CONSTRAINED, + + /** Assigner doesn't have pending splits. 
*/ + UNAVAILABLE + } + + private final Status status; + private final IcebergSourceSplit split; + + private GetSplitResult(Status status) { + this.status = status; + this.split = null; + } + + private GetSplitResult(IcebergSourceSplit split) { + Preconditions.checkNotNull(split, "Split cannot be null"); + this.status = Status.AVAILABLE; + this.split = split; + } + + public Status status() { + return status; + } + + public IcebergSourceSplit split() { + return split; + } + + private static final GetSplitResult UNAVAILABLE = new GetSplitResult(Status.UNAVAILABLE); + private static final GetSplitResult CONSTRAINED = new GetSplitResult(Status.CONSTRAINED); + + public static GetSplitResult unavailable() { + return UNAVAILABLE; + } + + public static GetSplitResult constrained() { + return CONSTRAINED; + } + + public static GetSplitResult forSplit(IcebergSourceSplit split) { + return new GetSplitResult(split); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java new file mode 100644 index 000000000000..e58478897aef --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import java.util.Collection; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; +import org.apache.iceberg.flink.source.split.SerializableComparator; + +/** + * Create default assigner with a comparator that hands out splits where the order of the splits + * will be defined by the {@link SerializableComparator}. 
+ */ +public class OrderedSplitAssignerFactory implements SplitAssignerFactory { + private final SerializableComparator comparator; + + public OrderedSplitAssignerFactory(SerializableComparator comparator) { + this.comparator = comparator; + } + + @Override + public SplitAssigner createAssigner() { + return new DefaultSplitAssigner(comparator); + } + + @Override + public SplitAssigner createAssigner(Collection assignerState) { + return new DefaultSplitAssigner(comparator, assignerState); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java new file mode 100644 index 000000000000..a2e2ff364d46 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import java.util.Collection; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; + +/** Create simple assigner that hands out splits without any guarantee in order or locality. */ +public class SimpleSplitAssignerFactory implements SplitAssignerFactory { + public SimpleSplitAssignerFactory() {} + + @Override + public SplitAssigner createAssigner() { + return new DefaultSplitAssigner(null); + } + + @Override + public SplitAssigner createAssigner(Collection assignerState) { + return new DefaultSplitAssigner(null, assignerState); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java new file mode 100644 index 000000000000..dae7c8cca70c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.assigner; + +import java.io.Closeable; +import java.util.Collection; +import java.util.concurrent.CompletableFuture; +import javax.annotation.Nullable; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; + +/** + * SplitAssigner interface is extracted out as a separate component so that we can plug in different + * split assignment strategy for different requirements. E.g. + * + *

+ * <ul>
+ *   <li>Simple assigner with no ordering guarantee or locality-aware optimization.
+ *   <li>Locality-aware assigner that prefers splits that are local.
+ *   <li>Snapshot-aware assigner that assigns splits based on the order in which they were committed.
+ *   <li>Event time alignment assigner that assigns splits satisfying certain time ordering within
+ *       a single source or across sources.
+ * </ul>
+ *
+ * <p>
    Assigner implementation needs to be thread safe. Enumerator call the assigner APIs mostly from + * the coordinator thread. But enumerator may call the {@link SplitAssigner#pendingSplitCount()} + * from the I/O threads. + */ +public interface SplitAssigner extends Closeable { + + /** + * Some assigners may need to start background threads or perform other activity such as + * registering as listeners to updates from other event sources e.g., watermark tracker. + */ + default void start() {} + + /** + * Some assigners may need to perform certain actions when their corresponding enumerators are + * closed + */ + @Override + default void close() {} + + /** + * Request a new split from the assigner when enumerator trying to assign splits to awaiting + * readers. + * + *

    If enumerator wasn't able to assign the split (e.g., reader disconnected), enumerator should + * call {@link SplitAssigner#onUnassignedSplits} to return the split. + */ + GetSplitResult getNext(@Nullable String hostname); + + /** Add new splits discovered by enumerator */ + void onDiscoveredSplits(Collection splits); + + /** Forward addSplitsBack event (for failed reader) to assigner */ + void onUnassignedSplits(Collection splits); + + /** + * Some assigner (like event time alignment) may rack in-progress splits to advance watermark upon + * completed splits + */ + default void onCompletedSplits(Collection completedSplitIds) {} + + /** + * Get assigner state for checkpointing. This is a super-set API that works for all currently + * imagined assigners. + */ + Collection state(); + + /** + * Enumerator can get a notification via CompletableFuture when the assigner has more splits + * available later. Enumerator should schedule assignment in the thenAccept action of the future. + * + *

    Assigner will return the same future if this method is called again before the previous + * future is completed. + * + *

    The future can be completed from other thread, e.g. the coordinator thread from another + * thread for event time alignment. + * + *

    If enumerator need to trigger action upon the future completion, it may want to run it in + * the coordinator thread using {@link SplitEnumeratorContext#runInCoordinatorThread(Runnable)}. + */ + CompletableFuture isAvailable(); + + /** + * Return the number of pending splits that haven't been assigned yet. + * + *

    The enumerator can poll this API to publish a metric on the number of pending splits. + * + *

    The enumerator can also use this information to throttle split discovery for streaming read. + * If there are already many pending splits tracked by the assigner, it is undesirable to discover + * more splits and track them in the assigner. That will increase the memory footprint and + * enumerator checkpoint size. + * + *

    Throttling works better together with {@link ScanContext#maxPlanningSnapshotCount()}. + * Otherwise, the next split discovery after throttling will just discover all non-enumerated + * snapshots and splits, which defeats the purpose of throttling. + */ + int pendingSplitCount(); + + /** + * Return the number of pending records, which can act as a measure of the source lag. This value + * could be an estimation if the exact number of records cannot be accurately computed. + */ + long pendingRecords(); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java new file mode 100644 index 000000000000..6e02a556ffcd --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import java.io.Serializable; +import java.util.Collection; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; + +public interface SplitAssignerFactory extends Serializable { + + SplitAssigner createAssigner(); + + SplitAssigner createAssigner(Collection assignerState); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java new file mode 100644 index 000000000000..03ba67a554f9 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.assigner; + +import org.apache.flink.annotation.Internal; + +@Internal +public enum SplitAssignerType { + SIMPLE { + @Override + public SplitAssignerFactory factory() { + return new SimpleSplitAssignerFactory(); + } + }; + + public abstract SplitAssignerFactory factory(); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java new file mode 100644 index 000000000000..fc310606dee9 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.IOException; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicReference; +import javax.annotation.Nullable; +import org.apache.flink.api.connector.source.SourceEvent; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.connector.source.SupportsHandleExecutionAttemptSourceEvent; +import org.apache.iceberg.flink.source.assigner.GetSplitResult; +import org.apache.iceberg.flink.source.assigner.SplitAssigner; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.SplitRequestEvent; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +abstract class AbstractIcebergEnumerator + implements SplitEnumerator, + SupportsHandleExecutionAttemptSourceEvent { + private static final Logger LOG = LoggerFactory.getLogger(AbstractIcebergEnumerator.class); + + private final SplitEnumeratorContext enumeratorContext; + private final SplitAssigner assigner; + private final Map readersAwaitingSplit; + private final AtomicReference> availableFuture; + + AbstractIcebergEnumerator( + SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { + this.enumeratorContext = enumeratorContext; + this.assigner = assigner; + this.readersAwaitingSplit = new LinkedHashMap<>(); + this.availableFuture = new AtomicReference<>(); + this.enumeratorContext + .metricGroup() + // This number may not capture the entire backlog due to split discovery throttling to avoid + // excessive memory footprint. Some pending splits may not have been discovered yet. 
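+        // The gauge reflects only splits currently tracked by the assigner; the "pendingRecords"
+        // gauge registered below estimates the row-count backlog across those pending splits.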
+ .setUnassignedSplitsGauge(() -> Long.valueOf(assigner.pendingSplitCount())); + this.enumeratorContext.metricGroup().gauge("pendingRecords", assigner::pendingRecords); + } + + @Override + public void start() { + assigner.start(); + } + + @Override + public void close() throws IOException { + assigner.close(); + } + + @Override + public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { + // Iceberg source uses custom split request event to piggyback finished split ids. + throw new UnsupportedOperationException( + String.format( + Locale.ROOT, + "Received invalid default split request event " + + "from subtask %d as Iceberg source uses custom split request event", + subtaskId)); + } + + @Override + public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) { + if (sourceEvent instanceof SplitRequestEvent) { + SplitRequestEvent splitRequestEvent = (SplitRequestEvent) sourceEvent; + LOG.info("Received request split event from subtask {}", subtaskId); + assigner.onCompletedSplits(splitRequestEvent.finishedSplitIds()); + readersAwaitingSplit.put(subtaskId, splitRequestEvent.requesterHostname()); + assignSplits(); + } else { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "Received unknown event from subtask %d: %s", + subtaskId, + sourceEvent.getClass().getCanonicalName())); + } + } + + // Flink's SourceCoordinator already keeps track of subTask to splits mapping. + // It already takes care of re-assigning splits to speculated attempts as well. + @Override + public void handleSourceEvent(int subTaskId, int attemptNumber, SourceEvent sourceEvent) { + handleSourceEvent(subTaskId, sourceEvent); + } + + @Override + public void addSplitsBack(List splits, int subtaskId) { + LOG.info("Add {} splits back to the pool for failed subtask {}", splits.size(), subtaskId); + assigner.onUnassignedSplits(splits); + assignSplits(); + } + + @Override + public void addReader(int subtaskId) { + LOG.info("Added reader: {}", subtaskId); + } + + private void assignSplits() { + LOG.info("Assigning splits for {} awaiting readers", readersAwaitingSplit.size()); + Iterator> awaitingReader = + readersAwaitingSplit.entrySet().iterator(); + while (awaitingReader.hasNext()) { + Map.Entry nextAwaiting = awaitingReader.next(); + // if the reader that requested another split has failed in the meantime, remove + // it from the list of waiting readers + if (!enumeratorContext.registeredReaders().containsKey(nextAwaiting.getKey())) { + awaitingReader.remove(); + continue; + } + + int awaitingSubtask = nextAwaiting.getKey(); + String hostname = nextAwaiting.getValue(); + GetSplitResult getResult = assigner.getNext(hostname); + if (getResult.status() == GetSplitResult.Status.AVAILABLE) { + LOG.info("Assign split to subtask {}: {}", awaitingSubtask, getResult.split()); + enumeratorContext.assignSplit(getResult.split(), awaitingSubtask); + awaitingReader.remove(); + } else if (getResult.status() == GetSplitResult.Status.CONSTRAINED) { + getAvailableFutureIfNeeded(); + break; + } else if (getResult.status() == GetSplitResult.Status.UNAVAILABLE) { + if (shouldWaitForMoreSplits()) { + getAvailableFutureIfNeeded(); + break; + } else { + LOG.info("No more splits available for subtask {}", awaitingSubtask); + enumeratorContext.signalNoMoreSplits(awaitingSubtask); + awaitingReader.remove(); + } + } else { + throw new IllegalArgumentException("Unsupported status: " + getResult.status()); + } + } + } + + /** return true if enumerator should wait for splits like in the continuous 
enumerator case */ + protected abstract boolean shouldWaitForMoreSplits(); + + private synchronized void getAvailableFutureIfNeeded() { + if (availableFuture.get() != null) { + return; + } + + CompletableFuture future = + assigner + .isAvailable() + .thenAccept( + ignore -> + // Must run assignSplits in coordinator thread + // because the future may be completed from other threads. + // E.g., in event time alignment assigner, + // watermark advancement from another source may + // cause the available future to be completed + enumeratorContext.runInCoordinatorThread( + () -> { + LOG.debug("Executing callback of assignSplits"); + availableFuture.set(null); + assignSplits(); + })); + availableFuture.set(future); + LOG.debug("Registered callback for future available splits"); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java new file mode 100644 index 000000000000..41863ffee60b --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import java.util.Collection; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class ContinuousEnumerationResult { + private final Collection splits; + private final IcebergEnumeratorPosition fromPosition; + private final IcebergEnumeratorPosition toPosition; + + /** + * @param splits should never be null. But it can be an empty collection + * @param fromPosition can be null + * @param toPosition should never be null. 
But it can have null snapshotId and snapshotTimestampMs + */ + ContinuousEnumerationResult( + Collection splits, + IcebergEnumeratorPosition fromPosition, + IcebergEnumeratorPosition toPosition) { + Preconditions.checkArgument(splits != null, "Invalid to splits collection: null"); + Preconditions.checkArgument(toPosition != null, "Invalid end position: null"); + this.splits = splits; + this.fromPosition = fromPosition; + this.toPosition = toPosition; + } + + public Collection splits() { + return splits; + } + + public IcebergEnumeratorPosition fromPosition() { + return fromPosition; + } + + public IcebergEnumeratorPosition toPosition() { + return toPosition; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java new file mode 100644 index 000000000000..c50c3854ee14 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.IOException; +import java.util.Collections; +import java.util.Objects; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.assigner.SplitAssigner; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.util.ElapsedTimeGauge; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Internal +public class ContinuousIcebergEnumerator extends AbstractIcebergEnumerator { + + private static final Logger LOG = LoggerFactory.getLogger(ContinuousIcebergEnumerator.class); + + /** + * This is hardcoded, as {@link ScanContext#maxPlanningSnapshotCount()} could be the knob to + * control the total number of snapshots worth of splits tracked by assigner. + */ + private static final int ENUMERATION_SPLIT_COUNT_HISTORY_SIZE = 3; + + private final SplitEnumeratorContext enumeratorContext; + private final SplitAssigner assigner; + private final ScanContext scanContext; + private final ContinuousSplitPlanner splitPlanner; + + /** + * snapshotId for the last enumerated snapshot. next incremental enumeration should be based off + * this as the starting position. + */ + private final AtomicReference enumeratorPosition; + + /** Track enumeration result history for split discovery throttling. 
*/ + private final EnumerationHistory enumerationHistory; + + /** Count the consecutive failures and throw exception if the max allowed failres are reached */ + private transient int consecutiveFailures = 0; + + private final ElapsedTimeGauge elapsedSecondsSinceLastSplitDiscovery; + + public ContinuousIcebergEnumerator( + SplitEnumeratorContext enumeratorContext, + SplitAssigner assigner, + ScanContext scanContext, + ContinuousSplitPlanner splitPlanner, + @Nullable IcebergEnumeratorState enumState) { + super(enumeratorContext, assigner); + + this.enumeratorContext = enumeratorContext; + this.assigner = assigner; + this.scanContext = scanContext; + this.splitPlanner = splitPlanner; + this.enumeratorPosition = new AtomicReference<>(); + this.enumerationHistory = new EnumerationHistory(ENUMERATION_SPLIT_COUNT_HISTORY_SIZE); + this.elapsedSecondsSinceLastSplitDiscovery = new ElapsedTimeGauge(TimeUnit.SECONDS); + this.enumeratorContext + .metricGroup() + .gauge("elapsedSecondsSinceLastSplitDiscovery", elapsedSecondsSinceLastSplitDiscovery); + + if (enumState != null) { + this.enumeratorPosition.set(enumState.lastEnumeratedPosition()); + this.enumerationHistory.restore(enumState.enumerationSplitCountHistory()); + } + } + + @Override + public void start() { + super.start(); + enumeratorContext.callAsync( + this::discoverSplits, + this::processDiscoveredSplits, + 0L, + scanContext.monitorInterval().toMillis()); + } + + @Override + public void close() throws IOException { + splitPlanner.close(); + super.close(); + } + + @Override + protected boolean shouldWaitForMoreSplits() { + return true; + } + + @Override + public IcebergEnumeratorState snapshotState(long checkpointId) { + return new IcebergEnumeratorState( + enumeratorPosition.get(), assigner.state(), enumerationHistory.snapshot()); + } + + /** This method is executed in an IO thread pool. */ + private ContinuousEnumerationResult discoverSplits() { + int pendingSplitCountFromAssigner = assigner.pendingSplitCount(); + if (enumerationHistory.shouldPauseSplitDiscovery(pendingSplitCountFromAssigner)) { + // If the assigner already has many pending splits, it is better to pause split discovery. + // Otherwise, eagerly discovering more splits will just increase assigner memory footprint + // and enumerator checkpoint state size. + LOG.info( + "Pause split discovery as the assigner already has too many pending splits: {}", + pendingSplitCountFromAssigner); + return new ContinuousEnumerationResult( + Collections.emptyList(), enumeratorPosition.get(), enumeratorPosition.get()); + } else { + return splitPlanner.planSplits(enumeratorPosition.get()); + } + } + + /** This method is executed in a single coordinator thread. */ + private void processDiscoveredSplits(ContinuousEnumerationResult result, Throwable error) { + if (error == null) { + consecutiveFailures = 0; + if (!Objects.equals(result.fromPosition(), enumeratorPosition.get())) { + // Multiple discoverSplits() may be triggered with the same starting snapshot to the I/O + // thread pool. E.g., the splitDiscoveryInterval is very short (like 10 ms in some unit + // tests) or the thread pool is busy and multiple discovery actions are executed + // concurrently. Discovery result should only be accepted if the starting position + // matches the enumerator position (like compare-and-swap). 
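          // For example, two discovery runs may both start from snapshot 100; the first one to
          // finish advances the enumerator position (say, to snapshot 103), so the second result,
          // whose fromPosition still points at snapshot 100, no longer matches and is skipped
          // below instead of being applied twice.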
+ LOG.info( + "Skip {} discovered splits because the scan starting position doesn't match " + + "the current enumerator position: enumerator position = {}, scan starting position = {}", + result.splits().size(), + enumeratorPosition.get(), + result.fromPosition()); + } else { + elapsedSecondsSinceLastSplitDiscovery.refreshLastRecordedTime(); + // Sometimes, enumeration may yield no splits for a few reasons. + // - upstream paused or delayed streaming writes to the Iceberg table. + // - enumeration frequency is higher than the upstream write frequency. + if (!result.splits().isEmpty()) { + assigner.onDiscoveredSplits(result.splits()); + // EnumerationHistory makes throttling decision on split discovery + // based on the total number of splits discovered in the last a few cycles. + // Only update enumeration history when there are some discovered splits. + enumerationHistory.add(result.splits().size()); + LOG.info( + "Added {} splits discovered between ({}, {}] to the assigner", + result.splits().size(), + result.fromPosition(), + result.toPosition()); + } else { + LOG.info( + "No new splits discovered between ({}, {}]", + result.fromPosition(), + result.toPosition()); + } + // update the enumerator position even if there is no split discovered + // or the toPosition is empty (e.g. for empty table). + enumeratorPosition.set(result.toPosition()); + LOG.info("Update enumerator position to {}", result.toPosition()); + } + } else { + consecutiveFailures++; + if (scanContext.maxAllowedPlanningFailures() < 0 + || consecutiveFailures <= scanContext.maxAllowedPlanningFailures()) { + LOG.error("Failed to discover new splits", error); + } else { + throw new RuntimeException("Failed to discover new splits", error); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java new file mode 100644 index 000000000000..2a1325178873 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
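ContinuousIcebergEnumerator above drives discovery through SplitEnumeratorContext#callAsync: planning runs on an I/O thread at the monitor interval, and results are processed on the single coordinator thread. The following standalone sketch (not from the patch; the executor names and placeholder methods are invented) mirrors only that threading pattern.

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

class PeriodicDiscoverySketch {
  public static void main(String[] args) throws InterruptedException {
    ScheduledExecutorService ioPool = Executors.newSingleThreadScheduledExecutor();
    ScheduledExecutorService coordinator = Executors.newSingleThreadScheduledExecutor();

    // discover on the I/O pool at a fixed interval, then hand the result to the coordinator
    // thread, mirroring callAsync(this::discoverSplits, this::processDiscoveredSplits, 0L, interval)
    ioPool.scheduleWithFixedDelay(
        () -> {
          int discovered = discoverSplits();
          coordinator.execute(() -> processDiscoveredSplits(discovered));
        },
        0L,
        10L,
        TimeUnit.SECONDS);

    TimeUnit.SECONDS.sleep(1);
    ioPool.shutdownNow();
    coordinator.shutdownNow();
  }

  private static int discoverSplits() {
    return 0; // placeholder for incremental split planning
  }

  private static void processDiscoveredSplits(int discovered) {
    // placeholder for the position check, assigner hand-off, and history update
  }
}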
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.Closeable; +import org.apache.flink.annotation.Internal; + +/** This interface is introduced so that we can plug in different split planner for unit test */ +@Internal +public interface ContinuousSplitPlanner extends Closeable { + + /** Discover the files appended between {@code lastPosition} and current table snapshot */ + ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java new file mode 100644 index 000000000000..e8478b8ea89d --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.ExecutorService; +import org.apache.flink.annotation.Internal; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.source.FlinkSplitPlanner; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Internal +public class ContinuousSplitPlannerImpl implements ContinuousSplitPlanner { + private static final Logger LOG = LoggerFactory.getLogger(ContinuousSplitPlannerImpl.class); + + private final Table table; + private final ScanContext scanContext; + private final boolean isSharedPool; + private final ExecutorService workerPool; + private final TableLoader tableLoader; + + /** + * @param tableLoader A cloned tableLoader. + * @param threadName thread name prefix for worker pool to run the split planning. If null, a + * shared worker pool will be used. 
+ */ + public ContinuousSplitPlannerImpl( + TableLoader tableLoader, ScanContext scanContext, String threadName) { + this.tableLoader = tableLoader.clone(); + this.tableLoader.open(); + this.table = this.tableLoader.loadTable(); + this.scanContext = scanContext; + this.isSharedPool = threadName == null; + this.workerPool = + isSharedPool + ? ThreadPools.getWorkerPool() + : ThreadPools.newFixedThreadPool( + "iceberg-plan-worker-pool-" + threadName, scanContext.planParallelism()); + } + + @Override + public void close() throws IOException { + if (!isSharedPool) { + workerPool.shutdown(); + } + tableLoader.close(); + } + + @Override + public ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition) { + table.refresh(); + if (lastPosition != null) { + return discoverIncrementalSplits(lastPosition); + } else { + return discoverInitialSplits(); + } + } + + private Snapshot toSnapshotInclusive( + Long lastConsumedSnapshotId, Snapshot currentSnapshot, int maxPlanningSnapshotCount) { + // snapshots are in reverse order (latest snapshot first) + List snapshots = + Lists.newArrayList( + SnapshotUtil.ancestorsBetween( + table, currentSnapshot.snapshotId(), lastConsumedSnapshotId)); + if (snapshots.size() <= maxPlanningSnapshotCount) { + return currentSnapshot; + } else { + // Because snapshots are in reverse order of commit history, this index returns + // the max allowed number of snapshots from the lastConsumedSnapshotId. + return snapshots.get(snapshots.size() - maxPlanningSnapshotCount); + } + } + + private ContinuousEnumerationResult discoverIncrementalSplits( + IcebergEnumeratorPosition lastPosition) { + Snapshot currentSnapshot = + scanContext.branch() != null + ? table.snapshot(scanContext.branch()) + : table.currentSnapshot(); + + if (currentSnapshot == null) { + // empty table + Preconditions.checkArgument( + lastPosition.snapshotId() == null, + "Invalid last enumerated position for an empty table: not null"); + LOG.info("Skip incremental scan because table is empty"); + return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); + } else if (lastPosition.snapshotId() != null + && currentSnapshot.snapshotId() == lastPosition.snapshotId()) { + LOG.info("Current table snapshot is already enumerated: {}", currentSnapshot.snapshotId()); + return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); + } else { + Long lastConsumedSnapshotId = lastPosition.snapshotId(); + Snapshot toSnapshotInclusive = + toSnapshotInclusive( + lastConsumedSnapshotId, currentSnapshot, scanContext.maxPlanningSnapshotCount()); + IcebergEnumeratorPosition newPosition = + IcebergEnumeratorPosition.of( + toSnapshotInclusive.snapshotId(), toSnapshotInclusive.timestampMillis()); + ScanContext incrementalScan = + scanContext.copyWithAppendsBetween( + lastPosition.snapshotId(), toSnapshotInclusive.snapshotId()); + List splits = + FlinkSplitPlanner.planIcebergSourceSplits(table, incrementalScan, workerPool); + LOG.info( + "Discovered {} splits from incremental scan: " + + "from snapshot (exclusive) is {}, to snapshot (inclusive) is {}", + splits.size(), + lastPosition, + newPosition); + return new ContinuousEnumerationResult(splits, lastPosition, newPosition); + } + } + + /** + * Discovery initial set of splits based on {@link StreamingStartingStrategy}. + *

  • {@link ContinuousEnumerationResult#splits()} should contain initial splits discovered from + * table scan for {@link StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}. For all other + * strategies, splits collection should be empty. + *
  • {@link ContinuousEnumerationResult#toPosition()} points to the starting position for the + * next incremental split discovery with exclusive behavior. Meaning files committed by the + * snapshot from the position in {@code ContinuousEnumerationResult} won't be included in the + * next incremental scan. + */ + private ContinuousEnumerationResult discoverInitialSplits() { + Optional startSnapshotOptional = startSnapshot(table, scanContext); + if (!startSnapshotOptional.isPresent()) { + return new ContinuousEnumerationResult( + Collections.emptyList(), null, IcebergEnumeratorPosition.empty()); + } + + Snapshot startSnapshot = startSnapshotOptional.get(); + LOG.info( + "Get starting snapshot id {} based on strategy {}", + startSnapshot.snapshotId(), + scanContext.streamingStartingStrategy()); + List splits = Collections.emptyList(); + IcebergEnumeratorPosition toPosition; + if (scanContext.streamingStartingStrategy() + == StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) { + // do a batch table scan first + splits = + FlinkSplitPlanner.planIcebergSourceSplits( + table, scanContext.copyWithSnapshotId(startSnapshot.snapshotId()), workerPool); + LOG.info( + "Discovered {} splits from initial batch table scan with snapshot Id {}", + splits.size(), + startSnapshot.snapshotId()); + // For TABLE_SCAN_THEN_INCREMENTAL, incremental mode starts exclusive from the startSnapshot + toPosition = + IcebergEnumeratorPosition.of(startSnapshot.snapshotId(), startSnapshot.timestampMillis()); + } else if (scanContext.streamingStartingStrategy() + == StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE) { + toPosition = + IcebergEnumeratorPosition.of(startSnapshot.snapshotId(), startSnapshot.timestampMillis()); + LOG.info( + "Start incremental scan with start snapshot (exclusive): id = {}, timestamp = {}", + startSnapshot.snapshotId(), + startSnapshot.timestampMillis()); + } else { + // For all other modes, starting snapshot should be consumed inclusively. + // Use parentId to achieve the inclusive behavior. It is fine if parentId is null. + Long parentSnapshotId = startSnapshot.parentId(); + if (parentSnapshotId != null) { + Snapshot parentSnapshot = table.snapshot(parentSnapshotId); + Long parentSnapshotTimestampMs = + parentSnapshot != null ? parentSnapshot.timestampMillis() : null; + toPosition = IcebergEnumeratorPosition.of(parentSnapshotId, parentSnapshotTimestampMs); + } else { + toPosition = IcebergEnumeratorPosition.empty(); + } + + LOG.info( + "Start incremental scan with start snapshot (inclusive): id = {}, timestamp = {}", + startSnapshot.snapshotId(), + startSnapshot.timestampMillis()); + } + + return new ContinuousEnumerationResult(splits, null, toPosition); + } + + /** + * Calculate the starting snapshot based on the {@link StreamingStartingStrategy} defined in + * {@code ScanContext}. + * + *

    If the {@link StreamingStartingStrategy} is not {@link + * StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}, the start snapshot should be consumed + * inclusively. + */ + @VisibleForTesting + static Optional startSnapshot(Table table, ScanContext scanContext) { + switch (scanContext.streamingStartingStrategy()) { + case TABLE_SCAN_THEN_INCREMENTAL: + case INCREMENTAL_FROM_LATEST_SNAPSHOT: + case INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE: + return Optional.ofNullable(table.currentSnapshot()); + case INCREMENTAL_FROM_EARLIEST_SNAPSHOT: + return Optional.ofNullable(SnapshotUtil.oldestAncestor(table)); + case INCREMENTAL_FROM_SNAPSHOT_ID: + Snapshot matchedSnapshotById = table.snapshot(scanContext.startSnapshotId()); + Preconditions.checkArgument( + matchedSnapshotById != null, + "Start snapshot id not found in history: " + scanContext.startSnapshotId()); + return Optional.of(matchedSnapshotById); + case INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP: + Snapshot matchedSnapshotByTimestamp = + SnapshotUtil.oldestAncestorAfter(table, scanContext.startSnapshotTimestamp()); + Preconditions.checkArgument( + matchedSnapshotByTimestamp != null, + "Cannot find a snapshot after: " + scanContext.startSnapshotTimestamp()); + return Optional.of(matchedSnapshotByTimestamp); + default: + throw new IllegalArgumentException( + "Unknown starting strategy: " + scanContext.streamingStartingStrategy()); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java new file mode 100644 index 000000000000..ec56a9ecdac1 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import java.util.Arrays; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.flink.annotation.VisibleForTesting; + +/** + * This enumeration history is used for split discovery throttling. It tracks the discovered split + * count per every non-empty enumeration. 
+ */ +@ThreadSafe +class EnumerationHistory { + + private final int[] history; + // int (2B) should be enough without overflow for enumeration history + private int count; + + EnumerationHistory(int maxHistorySize) { + this.history = new int[maxHistorySize]; + } + + synchronized void restore(int[] restoredHistory) { + int startingOffset = 0; + int restoreSize = restoredHistory.length; + + if (restoredHistory.length > history.length) { + // keep the newest history + startingOffset = restoredHistory.length - history.length; + // only restore the latest history up to maxHistorySize + restoreSize = history.length; + } + + System.arraycopy(restoredHistory, startingOffset, history, 0, restoreSize); + count = restoreSize; + } + + synchronized int[] snapshot() { + int len = history.length; + if (count > len) { + int[] copy = new int[len]; + // this is like a circular buffer + int indexForOldest = count % len; + System.arraycopy(history, indexForOldest, copy, 0, len - indexForOldest); + System.arraycopy(history, 0, copy, len - indexForOldest, indexForOldest); + return copy; + } else { + return Arrays.copyOfRange(history, 0, count); + } + } + + /** Add the split count from the last enumeration result. */ + synchronized void add(int splitCount) { + int pos = count % history.length; + history[pos] = splitCount; + count += 1; + } + + @VisibleForTesting + synchronized boolean hasFullHistory() { + return count >= history.length; + } + + /** + * Checks whether split discovery should be paused. + * + * @return true if split discovery should pause because assigner has too many splits already. + */ + synchronized boolean shouldPauseSplitDiscovery(int pendingSplitCountFromAssigner) { + if (count < history.length) { + // only check throttling when full history is obtained. + return false; + } else { + // if ScanContext#maxPlanningSnapshotCount() is 10, each split enumeration can + // discovery splits up to 10 snapshots. if maxHistorySize is 3, the max number of + // splits tracked in assigner shouldn't be more than 10 * (3 + 1) snapshots + // worth of splits. +1 because there could be another enumeration when the + // pending splits fall just below the 10 * 3. + int totalSplitCountFromRecentDiscovery = Arrays.stream(history).reduce(0, Integer::sum); + return pendingSplitCountFromAssigner >= totalSplitCountFromRecentDiscovery; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java new file mode 100644 index 000000000000..96aba296f8cf --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
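To see the circular-buffer bookkeeping in EnumerationHistory above with concrete numbers, here is a standalone sketch using made-up split counts: once count exceeds the array length, the oldest surviving entry sits at index count % length, and snapshot() stitches the two segments back into chronological order.

import java.util.Arrays;

class CircularHistorySketch {
  public static void main(String[] args) {
    int[] history = new int[3];
    int count = 0;
    for (int splitCount : new int[] {5, 7, 9, 11}) { // four adds into a 3-slot buffer
      history[count % history.length] = splitCount;
      count++;
    }

    int indexForOldest = count % history.length; // == 1, the slot holding the oldest value (7)
    int[] ordered = new int[history.length];
    System.arraycopy(history, indexForOldest, ordered, 0, history.length - indexForOldest);
    System.arraycopy(history, 0, ordered, history.length - indexForOldest, indexForOldest);

    System.out.println(Arrays.toString(ordered)); // [7, 9, 11] -- the 5 has been overwritten
    // shouldPauseSplitDiscovery() would then pause discovery once the assigner's pending
    // split count reaches 7 + 9 + 11 = 27
  }
}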
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; + +class IcebergEnumeratorPosition { + private final Long snapshotId; + // Track snapshot timestamp mainly for info logging + private final Long snapshotTimestampMs; + + static IcebergEnumeratorPosition empty() { + return new IcebergEnumeratorPosition(null, null); + } + + static IcebergEnumeratorPosition of(long snapshotId, Long snapshotTimestampMs) { + return new IcebergEnumeratorPosition(snapshotId, snapshotTimestampMs); + } + + private IcebergEnumeratorPosition(Long snapshotId, Long snapshotTimestampMs) { + this.snapshotId = snapshotId; + this.snapshotTimestampMs = snapshotTimestampMs; + } + + boolean isEmpty() { + return snapshotId == null; + } + + Long snapshotId() { + return snapshotId; + } + + Long snapshotTimestampMs() { + return snapshotTimestampMs; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("snapshotId", snapshotId) + .add("snapshotTimestampMs", snapshotTimestampMs) + .toString(); + } + + @Override + public int hashCode() { + return Objects.hashCode(snapshotId, snapshotTimestampMs); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + IcebergEnumeratorPosition other = (IcebergEnumeratorPosition) o; + return Objects.equal(snapshotId, other.snapshotId()) + && Objects.equal(snapshotTimestampMs, other.snapshotTimestampMs()); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java new file mode 100644 index 000000000000..1c63807361c5 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
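A small same-package, test-style sketch (not part of the patch; the snapshot id and timestamp are made up) of the two position flavours used by IcebergEnumeratorPosition above: empty() marks that nothing has been enumerated yet, for example an empty table, while of(id, timestamp) records the inclusive end of the last discovery, which then becomes the exclusive starting point of the next one.

package org.apache.iceberg.flink.source.enumerator;

class PositionSketch {
  public static void main(String[] args) {
    IcebergEnumeratorPosition nothingYet = IcebergEnumeratorPosition.empty();
    IcebergEnumeratorPosition lastEnumerated =
        IcebergEnumeratorPosition.of(103L, 1_700_000_000_000L); // made-up snapshot id / timestamp

    System.out.println(nothingYet.isEmpty()); // true
    System.out.println(lastEnumerated.snapshotId()); // 103
  }
}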
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.IOException; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; + +class IcebergEnumeratorPositionSerializer + implements SimpleVersionedSerializer { + + public static final IcebergEnumeratorPositionSerializer INSTANCE = + new IcebergEnumeratorPositionSerializer(); + + private static final int VERSION = 1; + + private static final ThreadLocal SERIALIZER_CACHE = + ThreadLocal.withInitial(() -> new DataOutputSerializer(128)); + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(IcebergEnumeratorPosition position) throws IOException { + return serializeV1(position); + } + + @Override + public IcebergEnumeratorPosition deserialize(int version, byte[] serialized) throws IOException { + switch (version) { + case 1: + return deserializeV1(serialized); + default: + throw new IOException("Unknown version: " + version); + } + } + + private byte[] serializeV1(IcebergEnumeratorPosition position) throws IOException { + DataOutputSerializer out = SERIALIZER_CACHE.get(); + out.writeBoolean(position.snapshotId() != null); + if (position.snapshotId() != null) { + out.writeLong(position.snapshotId()); + } + out.writeBoolean(position.snapshotTimestampMs() != null); + if (position.snapshotTimestampMs() != null) { + out.writeLong(position.snapshotTimestampMs()); + } + byte[] result = out.getCopyOfBuffer(); + out.clear(); + return result; + } + + private IcebergEnumeratorPosition deserializeV1(byte[] serialized) throws IOException { + DataInputDeserializer in = new DataInputDeserializer(serialized); + Long snapshotId = null; + if (in.readBoolean()) { + snapshotId = in.readLong(); + } + + Long snapshotTimestampMs = null; + if (in.readBoolean()) { + snapshotTimestampMs = in.readLong(); + } + + if (snapshotId != null) { + return IcebergEnumeratorPosition.of(snapshotId, snapshotTimestampMs); + } else { + return IcebergEnumeratorPosition.empty(); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java new file mode 100644 index 000000000000..26fbad46c128 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
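Because IcebergEnumeratorPositionSerializer above is package-private, the round-trip sketch below is written test-style in the same package (the class and method names are invented): the version returned by getVersion() at write time is what callers must hand back to deserialize().

package org.apache.iceberg.flink.source.enumerator;

import java.io.IOException;

class PositionSerializerRoundTripSketch {
  static IcebergEnumeratorPosition roundTrip(IcebergEnumeratorPosition position)
      throws IOException {
    IcebergEnumeratorPositionSerializer serializer = IcebergEnumeratorPositionSerializer.INSTANCE;
    int version = serializer.getVersion(); // persisted next to the payload by the caller
    byte[] payload = serializer.serialize(position);
    return serializer.deserialize(version, payload);
  }
}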
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.Serializable; +import java.util.Collection; +import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; + +/** Enumerator state for checkpointing */ +@Internal +public class IcebergEnumeratorState implements Serializable { + @Nullable private final IcebergEnumeratorPosition lastEnumeratedPosition; + private final Collection pendingSplits; + private final int[] enumerationSplitCountHistory; + + public IcebergEnumeratorState(Collection pendingSplits) { + this(null, pendingSplits); + } + + public IcebergEnumeratorState( + @Nullable IcebergEnumeratorPosition lastEnumeratedPosition, + Collection pendingSplits) { + this(lastEnumeratedPosition, pendingSplits, new int[0]); + } + + public IcebergEnumeratorState( + @Nullable IcebergEnumeratorPosition lastEnumeratedPosition, + Collection pendingSplits, + int[] enumerationSplitCountHistory) { + this.lastEnumeratedPosition = lastEnumeratedPosition; + this.pendingSplits = pendingSplits; + this.enumerationSplitCountHistory = enumerationSplitCountHistory; + } + + @Nullable + public IcebergEnumeratorPosition lastEnumeratedPosition() { + return lastEnumeratedPosition; + } + + public Collection pendingSplits() { + return pendingSplits; + } + + public int[] enumerationSplitCountHistory() { + return enumerationSplitCountHistory; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java new file mode 100644 index 000000000000..f76f8a69ff0e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.IOException; +import java.util.Collection; +import org.apache.flink.annotation.Internal; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +@Internal +public class IcebergEnumeratorStateSerializer + implements SimpleVersionedSerializer { + + private static final int VERSION = 2; + + private static final ThreadLocal SERIALIZER_CACHE = + ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); + + private final IcebergEnumeratorPositionSerializer positionSerializer = + IcebergEnumeratorPositionSerializer.INSTANCE; + private final IcebergSourceSplitSerializer splitSerializer; + + public IcebergEnumeratorStateSerializer(boolean caseSensitive) { + this.splitSerializer = new IcebergSourceSplitSerializer(caseSensitive); + } + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(IcebergEnumeratorState enumState) throws IOException { + return serializeV2(enumState); + } + + @Override + public IcebergEnumeratorState deserialize(int version, byte[] serialized) throws IOException { + switch (version) { + case 1: + return deserializeV1(serialized); + case 2: + return deserializeV2(serialized); + default: + throw new IOException("Unknown version: " + version); + } + } + + @VisibleForTesting + byte[] serializeV1(IcebergEnumeratorState enumState) throws IOException { + DataOutputSerializer out = SERIALIZER_CACHE.get(); + serializeEnumeratorPosition(out, enumState.lastEnumeratedPosition(), positionSerializer); + serializePendingSplits(out, enumState.pendingSplits(), splitSerializer); + byte[] result = out.getCopyOfBuffer(); + out.clear(); + return result; + } + + @VisibleForTesting + IcebergEnumeratorState deserializeV1(byte[] serialized) throws IOException { + DataInputDeserializer in = new DataInputDeserializer(serialized); + IcebergEnumeratorPosition enumeratorPosition = + deserializeEnumeratorPosition(in, positionSerializer); + Collection pendingSplits = + deserializePendingSplits(in, splitSerializer); + return new IcebergEnumeratorState(enumeratorPosition, pendingSplits); + } + + @VisibleForTesting + byte[] serializeV2(IcebergEnumeratorState enumState) throws IOException { + DataOutputSerializer out = SERIALIZER_CACHE.get(); + serializeEnumeratorPosition(out, enumState.lastEnumeratedPosition(), positionSerializer); + serializePendingSplits(out, enumState.pendingSplits(), splitSerializer); + serializeEnumerationSplitCountHistory(out, enumState.enumerationSplitCountHistory()); + byte[] result = out.getCopyOfBuffer(); + out.clear(); + return result; + } + + @VisibleForTesting + IcebergEnumeratorState deserializeV2(byte[] serialized) throws IOException { + DataInputDeserializer in = new DataInputDeserializer(serialized); + IcebergEnumeratorPosition enumeratorPosition = + deserializeEnumeratorPosition(in, positionSerializer); + Collection pendingSplits = + deserializePendingSplits(in, splitSerializer); + int[] enumerationSplitCountHistory = 
deserializeEnumerationSplitCountHistory(in); + return new IcebergEnumeratorState( + enumeratorPosition, pendingSplits, enumerationSplitCountHistory); + } + + private static void serializeEnumeratorPosition( + DataOutputSerializer out, + IcebergEnumeratorPosition enumeratorPosition, + IcebergEnumeratorPositionSerializer positionSerializer) + throws IOException { + out.writeBoolean(enumeratorPosition != null); + if (enumeratorPosition != null) { + out.writeInt(positionSerializer.getVersion()); + byte[] positionBytes = positionSerializer.serialize(enumeratorPosition); + out.writeInt(positionBytes.length); + out.write(positionBytes); + } + } + + private static IcebergEnumeratorPosition deserializeEnumeratorPosition( + DataInputDeserializer in, IcebergEnumeratorPositionSerializer positionSerializer) + throws IOException { + IcebergEnumeratorPosition enumeratorPosition = null; + if (in.readBoolean()) { + int version = in.readInt(); + byte[] positionBytes = new byte[in.readInt()]; + in.read(positionBytes); + enumeratorPosition = positionSerializer.deserialize(version, positionBytes); + } + return enumeratorPosition; + } + + private static void serializePendingSplits( + DataOutputSerializer out, + Collection pendingSplits, + IcebergSourceSplitSerializer splitSerializer) + throws IOException { + out.writeInt(splitSerializer.getVersion()); + out.writeInt(pendingSplits.size()); + for (IcebergSourceSplitState splitState : pendingSplits) { + byte[] splitBytes = splitSerializer.serialize(splitState.split()); + out.writeInt(splitBytes.length); + out.write(splitBytes); + out.writeUTF(splitState.status().name()); + } + } + + private static Collection deserializePendingSplits( + DataInputDeserializer in, IcebergSourceSplitSerializer splitSerializer) throws IOException { + int splitSerializerVersion = in.readInt(); + int splitCount = in.readInt(); + Collection pendingSplits = Lists.newArrayListWithCapacity(splitCount); + for (int i = 0; i < splitCount; ++i) { + byte[] splitBytes = new byte[in.readInt()]; + in.read(splitBytes); + IcebergSourceSplit split = splitSerializer.deserialize(splitSerializerVersion, splitBytes); + String statusName = in.readUTF(); + pendingSplits.add( + new IcebergSourceSplitState(split, IcebergSourceSplitStatus.valueOf(statusName))); + } + return pendingSplits; + } + + private static void serializeEnumerationSplitCountHistory( + DataOutputSerializer out, int[] enumerationSplitCountHistory) throws IOException { + out.writeInt(enumerationSplitCountHistory.length); + for (int enumerationSplitCount : enumerationSplitCountHistory) { + out.writeInt(enumerationSplitCount); + } + } + + private static int[] deserializeEnumerationSplitCountHistory(DataInputDeserializer in) + throws IOException { + int historySize = in.readInt(); + int[] history = new int[historySize]; + if (historySize > 0) { + for (int i = 0; i < historySize; ++i) { + history[i] = in.readInt(); + } + } + + return history; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java new file mode 100644 index 000000000000..4e55ea5d5fd6 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
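A round-trip sketch for IcebergEnumeratorStateSerializer above (not from the patch; the helper class is invented): serialize() always writes the current V2 layout, deserialize() dispatches on the stored version, and a legacy V1 payload is restored with an empty split-count history.

import java.io.IOException;
import org.apache.iceberg.flink.source.enumerator.IcebergEnumeratorState;
import org.apache.iceberg.flink.source.enumerator.IcebergEnumeratorStateSerializer;

class EnumeratorStateRoundTripSketch {
  static IcebergEnumeratorState roundTrip(IcebergEnumeratorState state, boolean caseSensitive)
      throws IOException {
    IcebergEnumeratorStateSerializer serializer =
        new IcebergEnumeratorStateSerializer(caseSensitive);
    byte[] payload = serializer.serialize(state); // written in the current (V2) format
    return serializer.deserialize(serializer.getVersion(), payload);
  }
}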
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.iceberg.flink.source.assigner.SplitAssigner; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; + +/** One-time split enumeration at the start-up for batch execution */ +@Internal +public class StaticIcebergEnumerator extends AbstractIcebergEnumerator { + private final SplitAssigner assigner; + + public StaticIcebergEnumerator( + SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { + super(enumeratorContext, assigner); + this.assigner = assigner; + } + + @Override + public void start() { + super.start(); + } + + @Override + protected boolean shouldWaitForMoreSplits() { + return false; + } + + @Override + public IcebergEnumeratorState snapshotState(long checkpointId) { + return new IcebergEnumeratorState(null, assigner.state(), new int[0]); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java new file mode 100644 index 000000000000..7b94c364c976 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.Collections; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.file.src.util.Pool; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * {@link RecordsWithSplitIds} is used to pass a batch of records from fetcher to source reader. 
+ * Batching improves the efficiency of the records handover. + * + *

    The {@link RecordsWithSplitIds} interface can encapsulate batches from multiple splits. This is + * the case for the Kafka source, where fetchers can retrieve records from multiple Kafka partitions at + * the same time. + * + *

    For file-based sources like Iceberg, readers always read one split/file at a time. Hence, we + * will only have a batch of records for one split here. + * + *

    This class uses array to store a batch of records from the same file (with the same + * fileOffset). + */ +class ArrayBatchRecords implements RecordsWithSplitIds> { + @Nullable private String splitId; + @Nullable private final Pool.Recycler recycler; + @Nullable private final T[] records; + private final int numberOfRecords; + private final Set finishedSplits; + private final RecordAndPosition recordAndPosition; + + // point to current read position within the records array + private int position; + + private ArrayBatchRecords( + @Nullable String splitId, + @Nullable Pool.Recycler recycler, + @Nullable T[] records, + int numberOfRecords, + int fileOffset, + long startingRecordOffset, + Set finishedSplits) { + Preconditions.checkArgument(numberOfRecords >= 0, "numberOfRecords can't be negative"); + Preconditions.checkArgument(fileOffset >= 0, "fileOffset can't be negative"); + Preconditions.checkArgument(startingRecordOffset >= 0, "numberOfRecords can't be negative"); + + this.splitId = splitId; + this.recycler = recycler; + this.records = records; + this.numberOfRecords = numberOfRecords; + this.finishedSplits = + Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); + this.recordAndPosition = new RecordAndPosition<>(); + + recordAndPosition.set(null, fileOffset, startingRecordOffset); + this.position = 0; + } + + @Nullable + @Override + public String nextSplit() { + String nextSplit = this.splitId; + // set the splitId to null to indicate no more splits + // this class only contains record for one split + this.splitId = null; + return nextSplit; + } + + @Nullable + @Override + public RecordAndPosition nextRecordFromSplit() { + if (position < numberOfRecords) { + recordAndPosition.record(records[position]); + position++; + return recordAndPosition; + } else { + return null; + } + } + + /** + * This method is called when all records from this batch has been emitted. If recycler is set, it + * should be called to return the records array back to pool. + */ + @Override + public void recycle() { + if (recycler != null) { + recycler.recycle(records); + } + } + + @Override + public Set finishedSplits() { + return finishedSplits; + } + + @VisibleForTesting + T[] records() { + return records; + } + + @VisibleForTesting + int numberOfRecords() { + return numberOfRecords; + } + + /** + * Create a ArrayBatchRecords backed up an array with records from the same file + * + * @param splitId Iceberg source only read from one split a time. We never have multiple records + * from multiple splits. + * @param recycler Because {@link DataIterator} with {@link RowData} returns an iterator of reused + * RowData object, we need to clone RowData eagerly when constructing a batch of records. We + * can use object pool to reuse the RowData array object which can be expensive to create. + * This recycler can be provided to recycle the array object back to pool after read is + * exhausted. If the {@link DataIterator} returns an iterator of non-reused objects, we don't + * need to clone objects. It is cheap to just create the batch array. Hence, we don't need + * object pool and recycler can be set to null. 
+ * @param records an array (maybe reused) holding a batch of records + * @param numberOfRecords actual number of records in the array + * @param fileOffset fileOffset for all records in this batch + * @param startingRecordOffset starting recordOffset + * @param record type + */ + public static ArrayBatchRecords forRecords( + String splitId, + Pool.Recycler recycler, + T[] records, + int numberOfRecords, + int fileOffset, + long startingRecordOffset) { + return new ArrayBatchRecords<>( + splitId, + recycler, + records, + numberOfRecords, + fileOffset, + startingRecordOffset, + Collections.emptySet()); + } + + /** + * Create ab ArrayBatchRecords with only finished split id + * + * @param splitId for the split that is just exhausted + */ + public static ArrayBatchRecords finishedSplit(String splitId) { + return new ArrayBatchRecords<>(null, null, null, 0, 0, 0, Collections.singleton(splitId)); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java new file mode 100644 index 000000000000..306afd1811be --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
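ArrayBatchRecords above is package-private, so the consumption sketch below is written test-style in the same package (the class and method are invented); it drains one batch through the RecordsWithSplitIds interface that ArrayBatchRecords implements: take the single split id, read records until exhausted, then recycle the pooled array.

package org.apache.iceberg.flink.source.reader;

import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds;

class BatchDrainSketch {
  // A batch holds records from exactly one split, so nextSplit() yields a split id once and
  // then null; records are read until nextRecordFromSplit() returns null, and recycle()
  // hands a pooled array (if any) back to its pool.
  static <T> int drain(RecordsWithSplitIds<RecordAndPosition<T>> batch) {
    int read = 0;
    String splitId = batch.nextSplit();
    while (splitId != null) {
      while (batch.nextRecordFromSplit() != null) {
        read++;
      }
      splitId = batch.nextSplit();
    }
    batch.recycle();
    return read;
  }
}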
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.io.IOException; +import java.util.NoSuchElementException; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.SourceReaderOptions; +import org.apache.flink.connector.file.src.util.Pool; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** This implementation stores record batch in array from recyclable pool */ +class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { + private final int batchSize; + private final int handoverQueueSize; + private final RecordFactory recordFactory; + + private transient Pool pool; + + ArrayPoolDataIteratorBatcher(ReadableConfig config, RecordFactory recordFactory) { + this.batchSize = config.get(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT); + this.handoverQueueSize = config.get(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY); + this.recordFactory = recordFactory; + } + + @Override + public CloseableIterator>> batch( + String splitId, DataIterator inputIterator) { + Preconditions.checkArgument(inputIterator != null, "Input data iterator can't be null"); + // lazily create pool as it is not serializable + if (pool == null) { + this.pool = createPoolOfBatches(handoverQueueSize); + } + return new ArrayPoolBatchIterator(splitId, inputIterator, pool); + } + + private Pool createPoolOfBatches(int numBatches) { + Pool poolOfBatches = new Pool<>(numBatches); + for (int batchId = 0; batchId < numBatches; batchId++) { + T[] batch = recordFactory.createBatch(batchSize); + poolOfBatches.add(batch); + } + + return poolOfBatches; + } + + private class ArrayPoolBatchIterator + implements CloseableIterator>> { + + private final String splitId; + private final DataIterator inputIterator; + private final Pool pool; + + ArrayPoolBatchIterator(String splitId, DataIterator inputIterator, Pool pool) { + this.splitId = splitId; + this.inputIterator = inputIterator; + this.pool = pool; + } + + @Override + public boolean hasNext() { + return inputIterator.hasNext(); + } + + @Override + public RecordsWithSplitIds> next() { + if (!inputIterator.hasNext()) { + throw new NoSuchElementException(); + } + + T[] batch = getCachedEntry(); + int recordCount = 0; + while (inputIterator.hasNext() && recordCount < batchSize) { + // The record produced by inputIterator can be reused like for the RowData case. + // inputIterator.next() can't be called again until the copy is made + // since the record is not consumed immediately. + T nextRecord = inputIterator.next(); + recordFactory.clone(nextRecord, batch, recordCount); + recordCount++; + if (!inputIterator.currentFileHasNext()) { + // break early so that records in the ArrayResultIterator + // have the same fileOffset. 
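+ // (i.e. a batch never spans two files, matching the same-fileOffset contract of ArrayBatchRecords)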
+ break; + } + } + + return ArrayBatchRecords.forRecords( + splitId, + pool.recycler(), + batch, + recordCount, + inputIterator.fileOffset(), + inputIterator.recordOffset() - recordCount); + } + + @Override + public void close() throws IOException { + inputIterator.close(); + } + + private T[] getCachedEntry() { + try { + return pool.pollEntry(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while waiting for array pool entry", e); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java new file mode 100644 index 000000000000..b158b0871a53 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.formats.avro.RowDataToAvroConverters; +import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.flink.FlinkSchemaUtil; + +public class AvroGenericRecordConverter implements RowDataConverter { + private final Schema avroSchema; + private final RowDataToAvroConverters.RowDataToAvroConverter flinkConverter; + private final TypeInformation outputTypeInfo; + + private AvroGenericRecordConverter(Schema avroSchema, RowType rowType) { + this.avroSchema = avroSchema; + this.flinkConverter = RowDataToAvroConverters.createConverter(rowType); + this.outputTypeInfo = new GenericRecordAvroTypeInfo(avroSchema); + } + + public static AvroGenericRecordConverter fromIcebergSchema( + org.apache.iceberg.Schema icebergSchema, String tableName) { + RowType rowType = FlinkSchemaUtil.convert(icebergSchema); + Schema avroSchema = AvroSchemaUtil.convert(icebergSchema, tableName); + return new AvroGenericRecordConverter(avroSchema, rowType); + } + + public static AvroGenericRecordConverter fromAvroSchema(Schema avroSchema, String tableName) { + DataType dataType = AvroSchemaConverter.convertToDataType(avroSchema.toString()); + LogicalType logicalType = 
TypeConversions.fromDataToLogicalType(dataType); + RowType rowType = RowType.of(logicalType.getChildren().toArray(new LogicalType[0])); + return new AvroGenericRecordConverter(avroSchema, rowType); + } + + @Override + public GenericRecord apply(RowData rowData) { + return (GenericRecord) flinkConverter.convert(avroSchema, rowData); + } + + @Override + public TypeInformation getProducedType() { + return outputTypeInfo; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java new file mode 100644 index 000000000000..f89e5ce13474 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.List; +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.source.AvroGenericRecordFileScanTaskReader; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.IcebergSource; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.flink.source.RowDataToAvroGenericRecordConverter; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Read Iceberg rows as {@link GenericRecord}. + * + * @deprecated since 1.7.0. Will be removed in 2.0.0; use {@link + * IcebergSource#forOutputType(RowDataConverter)} and {@link AvroGenericRecordConverter} + * instead. + */ +@Deprecated +public class AvroGenericRecordReaderFunction extends DataIteratorReaderFunction { + private final String tableName; + private final Schema readSchema; + private final FileIO io; + private final EncryptionManager encryption; + private final RowDataFileScanTaskReader rowDataReader; + + private transient RowDataToAvroGenericRecordConverter converter; + + /** + * Create a reader function without projection and name mapping. Column name is case-insensitive. 
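Usage sketch: {@code fromTable(table)} reads the table's full schema (no projection, no filters) and + * emits Avro {@code GenericRecord}s.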
+ */ + public static AvroGenericRecordReaderFunction fromTable(Table table) { + return new AvroGenericRecordReaderFunction( + table.name(), + new Configuration(), + table.schema(), + null, + null, + false, + table.io(), + table.encryption(), + null); + } + + public AvroGenericRecordReaderFunction( + String tableName, + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + FileIO io, + EncryptionManager encryption, + List filters) { + super(new ListDataIteratorBatcher<>(config)); + this.tableName = tableName; + this.readSchema = readSchema(tableSchema, projectedSchema); + this.io = io; + this.encryption = encryption; + this.rowDataReader = + new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters); + } + + @Override + protected DataIterator createDataIterator(IcebergSourceSplit split) { + return new DataIterator<>( + new AvroGenericRecordFileScanTaskReader(rowDataReader, lazyConverter()), + split.task(), + io, + encryption); + } + + private RowDataToAvroGenericRecordConverter lazyConverter() { + if (converter == null) { + this.converter = RowDataToAvroGenericRecordConverter.fromIcebergSchema(tableName, readSchema); + } + return converter; + } + + private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { + Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); + return projectedSchema == null ? tableSchema : projectedSchema; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java new file mode 100644 index 000000000000..4bb6f0a98c4c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import java.util.Comparator; +import java.util.concurrent.TimeUnit; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type.TypeID; +import org.apache.iceberg.types.Types; + +/** + * {@link SplitWatermarkExtractor} implementation which uses an Iceberg timestamp column statistics + * to get the watermarks for the {@link IcebergSourceSplit}. 
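For example (sketch), for a {@code LONG} column that stores microseconds since the epoch, pass {@link + * TimeUnit#MICROSECONDS} so each split's lower bound is converted to a millisecond watermark.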
This watermark is emitted by the {@link + * WatermarkExtractorRecordEmitter} along with the actual records. + */ +@Internal +public class ColumnStatsWatermarkExtractor implements SplitWatermarkExtractor, Serializable { + private final int eventTimeFieldId; + private final String eventTimeFieldName; + private final TimeUnit timeUnit; + + /** + * Creates the extractor. + * + * @param schema The schema of the Table + * @param eventTimeFieldName The column which should be used as an event time + * @param timeUnit Used for converting the long value to epoch milliseconds + */ + public ColumnStatsWatermarkExtractor( + Schema schema, String eventTimeFieldName, TimeUnit timeUnit) { + Types.NestedField field = schema.findField(eventTimeFieldName); + TypeID typeID = field.type().typeId(); + Preconditions.checkArgument( + typeID.equals(TypeID.LONG) || typeID.equals(TypeID.TIMESTAMP), + "Found %s, expected a LONG or TIMESTAMP column for watermark generation.", + typeID); + this.eventTimeFieldId = field.fieldId(); + this.eventTimeFieldName = eventTimeFieldName; + // Use the timeUnit only for Long columns. + this.timeUnit = typeID.equals(TypeID.LONG) ? timeUnit : TimeUnit.MICROSECONDS; + } + + @VisibleForTesting + ColumnStatsWatermarkExtractor(int eventTimeFieldId, String eventTimeFieldName) { + this.eventTimeFieldId = eventTimeFieldId; + this.eventTimeFieldName = eventTimeFieldName; + this.timeUnit = TimeUnit.MICROSECONDS; + } + + /** + * Get the watermark for a split using column statistics. + * + * @param split The split + * @return The watermark + * @throws IllegalArgumentException if there is no statistics for the column + */ + @Override + public long extractWatermark(IcebergSourceSplit split) { + return split.task().files().stream() + .map( + scanTask -> { + Preconditions.checkArgument( + scanTask.file().lowerBounds() != null + && scanTask.file().lowerBounds().get(eventTimeFieldId) != null, + "Missing statistics for column name = %s in file = %s", + eventTimeFieldName, + eventTimeFieldId, + scanTask.file()); + return timeUnit.toMillis( + Conversions.fromByteBuffer( + Types.LongType.get(), scanTask.file().lowerBounds().get(eventTimeFieldId))); + }) + .min(Comparator.comparingLong(l -> l)) + .get(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java new file mode 100644 index 000000000000..e1e7c17d63c5 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.util.List; +import org.apache.flink.annotation.Internal; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Schema; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.FileScanTaskReader; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +@Internal +public class ConverterReaderFunction extends DataIteratorReaderFunction { + private final RowDataConverter converter; + private final Schema tableSchema; + private final Schema readSchema; + private final String nameMapping; + private final boolean caseSensitive; + private final FileIO io; + private final EncryptionManager encryption; + private final List filters; + private final long limit; + + private transient RecordLimiter recordLimiter = null; + + public ConverterReaderFunction( + RowDataConverter converter, + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + FileIO io, + EncryptionManager encryption, + List filters, + long limit) { + super(new ListDataIteratorBatcher<>(config)); + this.converter = converter; + this.tableSchema = tableSchema; + this.readSchema = readSchema(tableSchema, projectedSchema); + this.nameMapping = nameMapping; + this.caseSensitive = caseSensitive; + this.io = io; + this.encryption = encryption; + this.filters = filters; + this.limit = limit; + } + + @Override + protected DataIterator createDataIterator(IcebergSourceSplit split) { + RowDataFileScanTaskReader rowDataReader = + new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters); + return new LimitableDataIterator<>( + new ConverterFileScanTaskReader<>(rowDataReader, converter), + split.task(), + io, + encryption, + lazyLimiter()); + } + + private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { + Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); + return projectedSchema == null ? 
tableSchema : projectedSchema; + } + + /** Lazily create RecordLimiter to avoid the need to make it serializable */ + private RecordLimiter lazyLimiter() { + if (recordLimiter == null) { + this.recordLimiter = RecordLimiter.create(limit); + } + + return recordLimiter; + } + + private static class ConverterFileScanTaskReader implements FileScanTaskReader { + private final RowDataFileScanTaskReader rowDataReader; + private final RowDataConverter converter; + + ConverterFileScanTaskReader( + RowDataFileScanTaskReader rowDataReader, RowDataConverter converter) { + this.rowDataReader = rowDataReader; + this.converter = converter; + } + + @Override + public CloseableIterator open( + FileScanTask fileScanTask, InputFilesDecryptor inputFilesDecryptor) { + return CloseableIterator.transform( + rowDataReader.open(fileScanTask, inputFilesDecryptor), converter); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java new file mode 100644 index 000000000000..c376e359c600 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.io.CloseableIterator; + +/** + * Batcher converts iterator of T into iterator of batched {@code + * RecordsWithSplitIds>}, as FLIP-27's {@link SplitReader#fetch()} returns + * batched records. + */ +@FunctionalInterface +public interface DataIteratorBatcher extends Serializable { + CloseableIterator>> batch( + String splitId, DataIterator inputIterator); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java new file mode 100644 index 000000000000..bbf797ef4aa8 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.CloseableIterator; + +/** A {@link ReaderFunction} implementation that uses {@link DataIterator}. */ +public abstract class DataIteratorReaderFunction implements ReaderFunction { + private final DataIteratorBatcher batcher; + + public DataIteratorReaderFunction(DataIteratorBatcher batcher) { + this.batcher = batcher; + } + + protected abstract DataIterator createDataIterator(IcebergSourceSplit split); + + @Override + public CloseableIterator>> apply( + IcebergSourceSplit split) { + DataIterator inputIterator = createDataIterator(split); + inputIterator.seek(split.fileOffset(), split.recordOffset()); + return batcher.batch(split.splitId(), inputIterator); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java new file mode 100644 index 000000000000..f143b8d2df2e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.util.Collection; +import java.util.Collections; +import java.util.Map; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.connector.base.source.reader.SingleThreadMultiplexSourceReaderBase; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.SerializableComparator; +import org.apache.iceberg.flink.source.split.SplitRequestEvent; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +@Internal +public class IcebergSourceReader + extends SingleThreadMultiplexSourceReaderBase< + RecordAndPosition, T, IcebergSourceSplit, IcebergSourceSplit> { + + public IcebergSourceReader( + SerializableRecordEmitter emitter, + IcebergSourceReaderMetrics metrics, + ReaderFunction readerFunction, + SerializableComparator splitComparator, + SourceReaderContext context) { + super( + () -> new IcebergSourceSplitReader<>(metrics, readerFunction, splitComparator, context), + emitter, + context.getConfiguration(), + context); + } + + @Override + public void start() { + // We request a split only if we did not get splits during the checkpoint restore. + // Otherwise, reader restarts will keep requesting more and more splits. + if (getNumberOfCurrentlyAssignedSplits() == 0) { + requestSplit(Collections.emptyList()); + } + } + + @Override + protected void onSplitFinished(Map finishedSplitIds) { + requestSplit(Lists.newArrayList(finishedSplitIds.keySet())); + } + + @Override + protected IcebergSourceSplit initializedState(IcebergSourceSplit split) { + return split; + } + + @Override + protected IcebergSourceSplit toSplitType(String splitId, IcebergSourceSplit splitState) { + return splitState; + } + + private void requestSplit(Collection finishedSplitIds) { + context.sendSourceEventToCoordinator(new SplitRequestEvent(finishedSplitIds)); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java new file mode 100644 index 000000000000..2a3e1dd86b95 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.MetricGroup; + +public class IcebergSourceReaderMetrics { + private final Counter assignedSplits; + private final Counter assignedBytes; + private final Counter finishedSplits; + private final Counter finishedBytes; + private final Counter splitReaderFetchCalls; + + public IcebergSourceReaderMetrics(MetricGroup metrics, String fullTableName) { + MetricGroup readerMetrics = + metrics.addGroup("IcebergSourceReader").addGroup("table", fullTableName); + + this.assignedSplits = readerMetrics.counter("assignedSplits"); + this.assignedBytes = readerMetrics.counter("assignedBytes"); + this.finishedSplits = readerMetrics.counter("finishedSplits"); + this.finishedBytes = readerMetrics.counter("finishedBytes"); + this.splitReaderFetchCalls = readerMetrics.counter("splitReaderFetchCalls"); + } + + public void incrementAssignedSplits(long count) { + assignedSplits.inc(count); + } + + public void incrementAssignedBytes(long count) { + assignedBytes.inc(count); + } + + public void incrementFinishedSplits(long count) { + finishedSplits.inc(count); + } + + public void incrementFinishedBytes(long count) { + finishedBytes.inc(count); + } + + public void incrementSplitReaderFetchCalls(long count) { + splitReaderFetchCalls.inc(count); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java new file mode 100644 index 000000000000..bcd72e25036b --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Queue; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.connector.base.source.reader.RecordsBySplits; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; +import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; +import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.SerializableComparator; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Queues; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class IcebergSourceSplitReader implements SplitReader, IcebergSourceSplit> { + private static final Logger LOG = LoggerFactory.getLogger(IcebergSourceSplitReader.class); + + private final IcebergSourceReaderMetrics metrics; + private final ReaderFunction openSplitFunction; + private final SerializableComparator splitComparator; + private final int indexOfSubtask; + private final Queue splits; + + private CloseableIterator>> currentReader; + private IcebergSourceSplit currentSplit; + private String currentSplitId; + + IcebergSourceSplitReader( + IcebergSourceReaderMetrics metrics, + ReaderFunction openSplitFunction, + SerializableComparator splitComparator, + SourceReaderContext context) { + this.metrics = metrics; + this.openSplitFunction = openSplitFunction; + this.splitComparator = splitComparator; + this.indexOfSubtask = context.getIndexOfSubtask(); + this.splits = Queues.newArrayDeque(); + } + + /** + * The method reads a batch of records from the assigned splits. If all the records from the + * current split are returned then it will emit a {@link ArrayBatchRecords#finishedSplit(String)} + * batch to signal this event. In the next fetch loop the reader will continue with the next split + * (if any). + * + * @return The fetched records + * @throws IOException If there is an error during reading + */ + @Override + public RecordsWithSplitIds> fetch() throws IOException { + metrics.incrementSplitReaderFetchCalls(1); + if (currentReader == null) { + IcebergSourceSplit nextSplit = splits.poll(); + if (nextSplit != null) { + currentSplit = nextSplit; + currentSplitId = nextSplit.splitId(); + currentReader = openSplitFunction.apply(currentSplit); + } else { + // return an empty result, which will lead to split fetch to be idle. + // SplitFetcherManager will then close idle fetcher. 
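+ // (an empty RecordsBySplits: no record batches and no finished split ids)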
+ return new RecordsBySplits<>(Collections.emptyMap(), Collections.emptySet()); + } + } + + if (currentReader.hasNext()) { + // Because Iterator#next() doesn't support checked exception, + // we need to wrap and unwrap the checked IOException with UncheckedIOException + try { + return currentReader.next(); + } catch (UncheckedIOException e) { + throw e.getCause(); + } + } else { + return finishSplit(); + } + } + + @Override + public void handleSplitsChanges(SplitsChange splitsChange) { + if (!(splitsChange instanceof SplitsAddition)) { + throw new UnsupportedOperationException( + String.format("Unsupported split change: %s", splitsChange.getClass())); + } + + if (splitComparator != null) { + List newSplits = Lists.newArrayList(splitsChange.splits()); + newSplits.sort(splitComparator); + LOG.info("Add {} splits to reader: {}", newSplits.size(), newSplits); + splits.addAll(newSplits); + } else { + LOG.info("Add {} splits to reader", splitsChange.splits().size()); + splits.addAll(splitsChange.splits()); + } + metrics.incrementAssignedSplits(splitsChange.splits().size()); + metrics.incrementAssignedBytes(calculateBytes(splitsChange)); + } + + @Override + public void wakeUp() {} + + @Override + public void close() throws Exception { + currentSplitId = null; + if (currentReader != null) { + currentReader.close(); + } + } + + @Override + public void pauseOrResumeSplits( + Collection splitsToPause, Collection splitsToResume) { + // IcebergSourceSplitReader only reads splits sequentially. When waiting for watermark alignment + // the SourceOperator will stop processing and recycling the fetched batches. This exhausts the + // {@link ArrayPoolDataIteratorBatcher#pool} and the `currentReader.next()` call will be + // blocked even without split-level watermark alignment. Based on this the + // `pauseOrResumeSplits` and the `wakeUp` are left empty. + } + + private long calculateBytes(IcebergSourceSplit split) { + return split.task().files().stream().map(FileScanTask::length).reduce(0L, Long::sum); + } + + private long calculateBytes(SplitsChange splitsChanges) { + return splitsChanges.splits().stream().map(this::calculateBytes).reduce(0L, Long::sum); + } + + private ArrayBatchRecords finishSplit() throws IOException { + if (currentReader != null) { + currentReader.close(); + currentReader = null; + } + + ArrayBatchRecords finishRecords = ArrayBatchRecords.finishedSplit(currentSplitId); + LOG.info("Split reader {} finished split: {}", indexOfSubtask, currentSplitId); + metrics.incrementFinishedSplits(1); + metrics.incrementFinishedBytes(calculateBytes(currentSplit)); + currentSplitId = null; + return finishRecords; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java new file mode 100644 index 000000000000..020e87646d05 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.FileScanTaskReader; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class LimitableDataIterator extends DataIterator { + private final RecordLimiter limiter; + + LimitableDataIterator( + FileScanTaskReader fileScanTaskReader, + CombinedScanTask task, + FileIO io, + EncryptionManager encryption, + RecordLimiter limiter) { + super(fileScanTaskReader, task, io, encryption); + Preconditions.checkArgument(limiter != null, "Invalid record limiter: null"); + this.limiter = limiter; + } + + @Override + public boolean hasNext() { + if (limiter.reachedLimit()) { + return false; + } + + return super.hasNext(); + } + + @Override + public T next() { + limiter.increment(); + return super.next(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java new file mode 100644 index 000000000000..1acb3df76102 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.util.Collections; +import java.util.List; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class ListBatchRecords implements RecordsWithSplitIds> { + private String splitId; + private final List records; + private final Set finishedSplits; + private final RecordAndPosition recordAndPosition; + + // point to current read position within the records list + private int position; + + ListBatchRecords( + String splitId, + List records, + int fileOffset, + long startingRecordOffset, + Set finishedSplits) { + this.splitId = splitId; + this.records = records; + this.finishedSplits = + Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); + this.recordAndPosition = new RecordAndPosition<>(); + this.recordAndPosition.set(null, fileOffset, startingRecordOffset); + + this.position = 0; + } + + @Nullable + @Override + public String nextSplit() { + String nextSplit = this.splitId; + // set the splitId to null to indicate no more splits + // this class only contains record for one split + this.splitId = null; + return nextSplit; + } + + @Nullable + @Override + public RecordAndPosition nextRecordFromSplit() { + if (position < records.size()) { + recordAndPosition.record(records.get(position)); + position++; + return recordAndPosition; + } else { + return null; + } + } + + @Override + public Set finishedSplits() { + return finishedSplits; + } + + public static ListBatchRecords forRecords( + String splitId, List records, int fileOffset, long startingRecordOffset) { + return new ListBatchRecords<>( + splitId, records, fileOffset, startingRecordOffset, Collections.emptySet()); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java new file mode 100644 index 000000000000..365416239d37 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.io.IOException; +import java.util.List; +import java.util.NoSuchElementException; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** + * FlinkRecordReaderFunction essentially cloned objects already. So there is no need to use array + * pool to clone objects. Simply create a new ArrayList for each batch. + */ +class ListDataIteratorBatcher implements DataIteratorBatcher { + + private final int batchSize; + + ListDataIteratorBatcher(ReadableConfig config) { + this.batchSize = config.get(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT); + } + + @Override + public CloseableIterator>> batch( + String splitId, DataIterator dataIterator) { + return new ListBatchIterator(splitId, dataIterator); + } + + private class ListBatchIterator + implements CloseableIterator>> { + + private final String splitId; + private final DataIterator inputIterator; + + ListBatchIterator(String splitId, DataIterator inputIterator) { + this.splitId = splitId; + this.inputIterator = inputIterator; + } + + @Override + public boolean hasNext() { + return inputIterator.hasNext(); + } + + @Override + public RecordsWithSplitIds> next() { + if (!inputIterator.hasNext()) { + throw new NoSuchElementException(); + } + + final List batch = Lists.newArrayListWithCapacity(batchSize); + int recordCount = 0; + while (inputIterator.hasNext() && recordCount < batchSize) { + T nextRecord = inputIterator.next(); + batch.add(nextRecord); + recordCount++; + if (!inputIterator.currentFileHasNext()) { + // break early so that records have the same fileOffset. + break; + } + } + + return ListBatchRecords.forRecords( + splitId, batch, inputIterator.fileOffset(), inputIterator.recordOffset() - recordCount); + } + + @Override + public void close() throws IOException { + if (inputIterator != null) { + inputIterator.close(); + } + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java new file mode 100644 index 000000000000..fb4466913b90 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.DataTaskReader; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** Reading metadata tables (like snapshots, manifests, etc.) */ +@Internal +public class MetaDataReaderFunction extends DataIteratorReaderFunction { + private final Schema readSchema; + private final FileIO io; + private final EncryptionManager encryption; + + public MetaDataReaderFunction( + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + FileIO io, + EncryptionManager encryption) { + super( + new ArrayPoolDataIteratorBatcher<>( + config, + new RowDataRecordFactory( + FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); + this.readSchema = readSchema(tableSchema, projectedSchema); + this.io = io; + this.encryption = encryption; + } + + @Override + public DataIterator createDataIterator(IcebergSourceSplit split) { + return new DataIterator<>(new DataTaskReader(readSchema), split.task(), io, encryption); + } + + private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { + Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); + return projectedSchema == null ? tableSchema : projectedSchema; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java new file mode 100644 index 000000000000..1ea91f10b4e7 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import java.util.function.Function; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.CloseableIterator; + +@FunctionalInterface +public interface ReaderFunction + extends Serializable, + Function< + IcebergSourceSplit, CloseableIterator>>> {} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java new file mode 100644 index 000000000000..10e7d2037a30 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.Locale; +import org.apache.flink.annotation.Internal; + +/** + * A record along with the reader position to be stored in the checkpoint. + * + *
    The position defines the point in the reader AFTER the record. Record processing and updating + * checkpointed state happens atomically. The position points to where the reader should resume + * after this record is processed. + * + *
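For example (usage sketch, mirroring {@code ArrayBatchRecords} above): {@code set(null, fileOffset, + * startingRecordOffset)} is called once per batch, then {@code record(nextRecord)} once per emitted + * record, which advances {@code recordOffset} by one. + + *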
    This mutable object is useful in cases where only one instance of a {@code RecordAndPosition} + * is needed at a time. Then the same instance of RecordAndPosition can be reused. + */ +@Internal +public class RecordAndPosition { + private T record; + private int fileOffset; + private long recordOffset; + + public RecordAndPosition(T record, int fileOffset, long recordOffset) { + this.record = record; + this.fileOffset = fileOffset; + this.recordOffset = recordOffset; + } + + public RecordAndPosition() {} + + // ------------------------------------------------------------------------ + + public T record() { + return record; + } + + public int fileOffset() { + return fileOffset; + } + + public long recordOffset() { + return recordOffset; + } + + /** Updates the record and position in this object. */ + public void set(T newRecord, int newFileOffset, long newRecordOffset) { + this.record = newRecord; + this.fileOffset = newFileOffset; + this.recordOffset = newRecordOffset; + } + + /** Sets the next record of a sequence. This increments the {@code recordOffset} by one. */ + public void record(T nextRecord) { + this.record = nextRecord; + this.recordOffset++; + } + + @Override + public String toString() { + return String.format(Locale.ROOT, "%s @ %d + %d", record, fileOffset, recordOffset); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java new file mode 100644 index 000000000000..ef92e2e6b81f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; + +/** + * In FLIP-27 source, SplitReader#fetch() returns a batch of records. Since DataIterator for RowData + * returns an iterator of reused RowData objects, RecordFactory is needed to (1) create object array + * that is recyclable via pool. (2) clone RowData element from DataIterator to the batch array. 
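+ * + * <p>Sketch of a hypothetical factory for some element type {@code Foo}: {@code createBatch(n)} would + * return {@code new Foo[n]} and {@code clone(from, batch, pos)} would store a copy such as {@code + * batch[pos] = from.copy()} (illustrative only; {@code RowDataRecordFactory} is the RowData variant + * used by this source).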
+ */ +interface RecordFactory extends Serializable { + /** Create a batch of records */ + T[] createBatch(int batchSize); + + /** Clone record into the specified position of the batch array */ + void clone(T from, T[] batch, int position); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java new file mode 100644 index 000000000000..f260a53089ff --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; + +@Internal +class RecordLimiter { + private final long limit; + private final AtomicLong counter; + + static RecordLimiter create(long limit) { + return new RecordLimiter(limit); + } + + private RecordLimiter(long limit) { + this.limit = limit; + this.counter = new AtomicLong(0); + } + + public boolean reachedLimit() { + return limit > 0 && counter.get() >= limit; + } + + public void increment() { + counter.incrementAndGet(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java new file mode 100644 index 000000000000..0e028ff91b87 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.conversion.DataStructureConverter; +import org.apache.flink.table.data.conversion.DataStructureConverters; +import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.Row; +import org.apache.iceberg.flink.FlinkSchemaUtil; + +public class RowConverter implements RowDataConverter { + private final DataStructureConverter converter; + private final TypeInformation outputTypeInfo; + + private RowConverter(RowType rowType, TypeInformation rowTypeInfo) { + this.converter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); + this.outputTypeInfo = rowTypeInfo; + } + + public static RowConverter fromIcebergSchema(org.apache.iceberg.Schema icebergSchema) { + RowType rowType = FlinkSchemaUtil.convert(icebergSchema); + ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(icebergSchema); + TypeInformation[] types = + resolvedSchema.getColumnDataTypes().stream() + .map(ExternalTypeInfo::of) + .toArray(TypeInformation[]::new); + String[] fieldNames = resolvedSchema.getColumnNames().toArray(String[]::new); + RowTypeInfo rowTypeInfo = new RowTypeInfo(types, fieldNames); + return new RowConverter(rowType, rowTypeInfo); + } + + @Override + public Row apply(RowData rowData) { + return (Row) converter.toExternal(rowData); + } + + @Override + public TypeInformation getProducedType() { + return outputTypeInfo; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java new file mode 100644 index 000000000000..98bb7e981840 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import java.util.function.Function; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.table.data.RowData; + +/** + * Convert RowData to a different output type. 
+ * + * @param output type + */ +public interface RowDataConverter + extends Function, ResultTypeQueryable, Serializable {} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java new file mode 100644 index 000000000000..c9208a0e1834 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.List; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +public class RowDataReaderFunction extends DataIteratorReaderFunction { + private final Schema tableSchema; + private final Schema readSchema; + private final String nameMapping; + private final boolean caseSensitive; + private final FileIO io; + private final EncryptionManager encryption; + private final List filters; + private final long limit; + + private transient RecordLimiter recordLimiter = null; + + public RowDataReaderFunction( + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + FileIO io, + EncryptionManager encryption, + List filters) { + this( + config, + tableSchema, + projectedSchema, + nameMapping, + caseSensitive, + io, + encryption, + filters, + -1L); + } + + public RowDataReaderFunction( + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + FileIO io, + EncryptionManager encryption, + List filters, + long limit) { + super( + new ArrayPoolDataIteratorBatcher<>( + config, + new RowDataRecordFactory( + FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); + this.tableSchema = tableSchema; + this.readSchema = readSchema(tableSchema, projectedSchema); + this.nameMapping = nameMapping; + this.caseSensitive = caseSensitive; + this.io = io; + this.encryption = encryption; + this.filters = filters; + this.limit = limit; + } + + @Override + public DataIterator createDataIterator(IcebergSourceSplit split) { + return new LimitableDataIterator<>( + new 
RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters), + split.task(), + io, + encryption, + lazyLimiter()); + } + + private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { + Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); + return projectedSchema == null ? tableSchema : projectedSchema; + } + + /** Lazily create RecordLimiter to avoid the need to make it serializable */ + private RecordLimiter lazyLimiter() { + if (recordLimiter == null) { + this.recordLimiter = RecordLimiter.create(limit); + } + + return recordLimiter; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java new file mode 100644 index 000000000000..ef2eedcf3cdd --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalSerializers; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.flink.FlinkRowData; +import org.apache.iceberg.flink.data.RowDataUtil; + +class RowDataRecordFactory implements RecordFactory { + private final RowType rowType; + private final TypeSerializer[] fieldSerializers; + private final RowData.FieldGetter[] fieldGetters; + + RowDataRecordFactory(RowType rowType) { + this.rowType = rowType; + this.fieldSerializers = createFieldSerializers(rowType); + this.fieldGetters = createFieldGetters(rowType); + } + + static TypeSerializer[] createFieldSerializers(RowType rowType) { + return rowType.getChildren().stream() + .map(InternalSerializers::create) + .toArray(TypeSerializer[]::new); + } + + static RowData.FieldGetter[] createFieldGetters(RowType rowType) { + RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; + for (int i = 0; i < rowType.getFieldCount(); ++i) { + fieldGetters[i] = FlinkRowData.createFieldGetter(rowType.getTypeAt(i), i); + } + + return fieldGetters; + } + + @Override + public RowData[] createBatch(int batchSize) { + RowData[] arr = new RowData[batchSize]; + for (int i = 0; i < batchSize; ++i) { + arr[i] = new GenericRowData(rowType.getFieldCount()); + } + return arr; + } + + @Override + public void clone(RowData from, RowData[] batch, int position) { + // Set the return value from RowDataUtil.clone back to the array. 
+ // Clone method returns same clone target object (reused) if it is a GenericRowData. + // Clone method will allocate a new GenericRowData object + // if the target object is NOT a GenericRowData. + // So we should always set the clone return value back to the array. + batch[position] = + RowDataUtil.clone(from, batch[position], rowType, fieldSerializers, fieldGetters); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java new file mode 100644 index 000000000000..a6e2c1dae243 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import org.apache.flink.annotation.Internal; +import org.apache.flink.connector.base.source.reader.RecordEmitter; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; + +@Internal +@FunctionalInterface +public interface SerializableRecordEmitter + extends RecordEmitter, T, IcebergSourceSplit>, Serializable { + static SerializableRecordEmitter defaultEmitter() { + return (element, output, split) -> { + output.collect(element.record()); + split.updatePosition(element.fileOffset(), element.recordOffset()); + }; + } + + static SerializableRecordEmitter emitterWithWatermark(SplitWatermarkExtractor extractor) { + return new WatermarkExtractorRecordEmitter<>(extractor); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java new file mode 100644 index 000000000000..d1c50ac8ca52 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
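[Editorial usage sketch, not part of this patch.] Because SerializableRecordEmitter above is a functional interface, a custom emitter can also be supplied as a lambda. The sketch below mirrors the default emitter but additionally counts records; RECORD_COUNTER is an assumed AtomicLong.

// Hypothetical wrapper around the default behavior: collect the record, then advance
// the split position so checkpoints resume from the right offset.
SerializableRecordEmitter<RowData> countingEmitter =
    (element, output, split) -> {
      RECORD_COUNTER.incrementAndGet();
      output.collect(element.record());
      split.updatePosition(element.fileOffset(), element.recordOffset());
    };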
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; + +/** The interface used to extract watermarks from splits. */ +public interface SplitWatermarkExtractor extends Serializable { + /** Get the watermark for a split. */ + long extractWatermark(IcebergSourceSplit split); +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java new file mode 100644 index 000000000000..02ef57d344b1 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Emitter which emits the watermarks, records and updates the split position. + * + *

    The Emitter emits watermarks at the beginning of every split provided by the {@link + * SplitWatermarkExtractor}. + */ +class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter { + private static final Logger LOG = LoggerFactory.getLogger(WatermarkExtractorRecordEmitter.class); + private final SplitWatermarkExtractor timeExtractor; + private String lastSplitId = null; + private long watermark; + + WatermarkExtractorRecordEmitter(SplitWatermarkExtractor timeExtractor) { + this.timeExtractor = timeExtractor; + } + + @Override + public void emitRecord( + RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { + if (!split.splitId().equals(lastSplitId)) { + long newWatermark = timeExtractor.extractWatermark(split); + if (newWatermark < watermark) { + LOG.info( + "Received a new split with lower watermark. Previous watermark = {}, current watermark = {}, previous split = {}, current split = {}", + watermark, + newWatermark, + lastSplitId, + split.splitId()); + } else { + watermark = newWatermark; + output.emitWatermark(new Watermark(watermark)); + LOG.debug("Watermark = {} emitted based on split = {}", watermark, lastSplitId); + } + + lastSplitId = split.splitId(); + } + + output.collect(element.record()); + split.updatePosition(element.fileOffset(), element.recordOffset()); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java new file mode 100644 index 000000000000..b6d6f60ef673 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
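[Editorial usage sketch, not part of this patch.] To make the watermark-per-split behavior described above concrete, here is a hedged wiring sketch. The extractor is purely illustrative: it derives a monotonic value from file sequence numbers rather than real event time, whereas production extractors normally read event-time bounds from column statistics, which is outside this excerpt.

import java.util.Objects;
import org.apache.flink.table.data.RowData;

// Hypothetical extractor: use the smallest file sequence number in the split as its watermark.
SplitWatermarkExtractor minSequenceExtractor =
    split ->
        split.task().files().stream()
            .map(fileScanTask -> fileScanTask.file().fileSequenceNumber()) // may be null for V1 tables
            .filter(Objects::nonNull)
            .mapToLong(Long::longValue)
            .min()
            .orElse(Long.MIN_VALUE);

// Emit that watermark at the start of every split, then fall back to normal record emission.
SerializableRecordEmitter<RowData> emitter =
    SerializableRecordEmitter.emitterWithWatermark(minSequenceExtractor);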
+ */ +package org.apache.iceberg.flink.source.split; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.SourceSplit; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.util.InstantiationUtil; +import org.apache.iceberg.BaseCombinedScanTask; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.ScanTaskParser; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +@Internal +public class IcebergSourceSplit implements SourceSplit, Serializable { + private static final long serialVersionUID = 1L; + private static final ThreadLocal SERIALIZER_CACHE = + ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); + + private final CombinedScanTask task; + + private int fileOffset; + private long recordOffset; + + // The splits are frequently serialized into checkpoints. + // Caching the byte representation makes repeated serialization cheap. + @Nullable private transient byte[] serializedBytesCache; + + private IcebergSourceSplit(CombinedScanTask task, int fileOffset, long recordOffset) { + this.task = task; + this.fileOffset = fileOffset; + this.recordOffset = recordOffset; + } + + public static IcebergSourceSplit fromCombinedScanTask(CombinedScanTask combinedScanTask) { + return fromCombinedScanTask(combinedScanTask, 0, 0L); + } + + public static IcebergSourceSplit fromCombinedScanTask( + CombinedScanTask combinedScanTask, int fileOffset, long recordOffset) { + return new IcebergSourceSplit(combinedScanTask, fileOffset, recordOffset); + } + + public CombinedScanTask task() { + return task; + } + + public int fileOffset() { + return fileOffset; + } + + public long recordOffset() { + return recordOffset; + } + + @Override + public String splitId() { + return MoreObjects.toStringHelper(this).add("files", toString(task.files())).toString(); + } + + public void updatePosition(int newFileOffset, long newRecordOffset) { + // invalidate the cache after position change + serializedBytesCache = null; + fileOffset = newFileOffset; + recordOffset = newRecordOffset; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("files", toString(task.files())) + .add("fileOffset", fileOffset) + .add("recordOffset", recordOffset) + .toString(); + } + + private String toString(Collection files) { + return Iterables.toString( + files.stream() + .map( + fileScanTask -> + MoreObjects.toStringHelper(fileScanTask) + .add("file", fileScanTask.file().location()) + .add("start", fileScanTask.start()) + .add("length", fileScanTask.length()) + .toString()) + .collect(Collectors.toList())); + } + + byte[] serializeV1() throws IOException { + if (serializedBytesCache == null) { + serializedBytesCache = InstantiationUtil.serializeObject(this); + } + + return serializedBytesCache; + } + + static IcebergSourceSplit deserializeV1(byte[] serialized) throws IOException { + try { + return InstantiationUtil.deserializeObject( + serialized, IcebergSourceSplit.class.getClassLoader()); + 
} catch (ClassNotFoundException e) { + throw new RuntimeException("Failed to deserialize the split.", e); + } + } + + byte[] serializeV2() throws IOException { + return serialize(2); + } + + byte[] serializeV3() throws IOException { + return serialize(3); + } + + private byte[] serialize(int version) throws IOException { + if (serializedBytesCache == null) { + DataOutputSerializer out = SERIALIZER_CACHE.get(); + Collection fileScanTasks = task.tasks(); + Preconditions.checkArgument( + fileOffset >= 0 && fileOffset < fileScanTasks.size(), + "Invalid file offset: %s. Should be within the range of [0, %s)", + fileOffset, + fileScanTasks.size()); + + out.writeInt(fileOffset); + out.writeLong(recordOffset); + out.writeInt(fileScanTasks.size()); + + for (FileScanTask fileScanTask : fileScanTasks) { + String taskJson = ScanTaskParser.toJson(fileScanTask); + writeTaskJson(out, taskJson, version); + } + + serializedBytesCache = out.getCopyOfBuffer(); + out.clear(); + } + + return serializedBytesCache; + } + + private static void writeTaskJson(DataOutputSerializer out, String taskJson, int version) + throws IOException { + switch (version) { + case 2: + out.writeUTF(taskJson); + break; + case 3: + SerializerHelper.writeLongUTF(out, taskJson); + break; + default: + throw new IllegalArgumentException("Unsupported version: " + version); + } + } + + static IcebergSourceSplit deserializeV2(byte[] serialized, boolean caseSensitive) + throws IOException { + return deserialize(serialized, caseSensitive, 2); + } + + static IcebergSourceSplit deserializeV3(byte[] serialized, boolean caseSensitive) + throws IOException { + return deserialize(serialized, caseSensitive, 3); + } + + private static IcebergSourceSplit deserialize( + byte[] serialized, boolean caseSensitive, int version) throws IOException { + DataInputDeserializer in = new DataInputDeserializer(serialized); + int fileOffset = in.readInt(); + long recordOffset = in.readLong(); + int taskCount = in.readInt(); + + List tasks = Lists.newArrayListWithCapacity(taskCount); + for (int i = 0; i < taskCount; ++i) { + String taskJson = readTaskJson(in, version); + FileScanTask task = ScanTaskParser.fromJson(taskJson, caseSensitive); + tasks.add(task); + } + + CombinedScanTask combinedScanTask = new BaseCombinedScanTask(tasks); + return IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, fileOffset, recordOffset); + } + + private static String readTaskJson(DataInputDeserializer in, int version) throws IOException { + switch (version) { + case 2: + return in.readUTF(); + case 3: + return SerializerHelper.readLongUTF(in); + default: + throw new IllegalArgumentException("Unsupported version: " + version); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java new file mode 100644 index 000000000000..d90d1dc88c91 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
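[Editorial usage sketch, not part of this patch.] A short, hedged sketch of the public surface of the split shown above; combinedScanTask is assumed to come from table scan planning (e.g. one element of table.newScan().planTasks()), and LOG is an assumed SLF4J logger.

// Wrap a planned CombinedScanTask into a source split, starting at the beginning.
IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask);

// After emitting the 100th record of the first file, record the new position.
// This also invalidates the cached serialized bytes inside the split.
split.updatePosition(0, 100L);

LOG.info(
    "Split {} now at fileOffset={}, recordOffset={}",
    split.splitId(), split.fileOffset(), split.recordOffset());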
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +import java.io.IOException; +import java.util.Locale; +import org.apache.flink.annotation.Internal; +import org.apache.flink.core.io.SimpleVersionedSerializer; + +@Internal +public class IcebergSourceSplitSerializer implements SimpleVersionedSerializer<IcebergSourceSplit> { + private static final int VERSION = 3; + + private final boolean caseSensitive; + + public IcebergSourceSplitSerializer(boolean caseSensitive) { + this.caseSensitive = caseSensitive; + } + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(IcebergSourceSplit split) throws IOException { + return split.serializeV3(); + } + + @Override + public IcebergSourceSplit deserialize(int version, byte[] serialized) throws IOException { + switch (version) { + case 1: + return IcebergSourceSplit.deserializeV1(serialized); + case 2: + return IcebergSourceSplit.deserializeV2(serialized, caseSensitive); + case 3: + return IcebergSourceSplit.deserializeV3(serialized, caseSensitive); + default: + throw new IOException( + String.format( + Locale.ROOT, + "Failed to deserialize IcebergSourceSplit. " + + "Encountered unsupported version: %d. Supported versions are [1, 2, 3]", + version)); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java new file mode 100644 index 000000000000..d9061e049e00 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
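[Editorial usage sketch, not part of this patch.] A minimal round-trip through the versioned serializer above; `split` is an assumed IcebergSourceSplit, and the checked IOException is simply propagated.

static IcebergSourceSplit roundTrip(IcebergSourceSplit split) throws IOException {
  // Serialization always writes the latest format (V3); deserialization accepts V1-V3.
  IcebergSourceSplitSerializer serializer =
      new IcebergSourceSplitSerializer(true /* caseSensitive */);
  byte[] bytes = serializer.serialize(split);
  return serializer.deserialize(serializer.getVersion(), bytes);
}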
+ */ +package org.apache.iceberg.flink.source.split; + +public class IcebergSourceSplitState { + private final IcebergSourceSplit split; + private final IcebergSourceSplitStatus status; + + public IcebergSourceSplitState(IcebergSourceSplit split, IcebergSourceSplitStatus status) { + this.split = split; + this.status = status; + } + + public IcebergSourceSplit split() { + return split; + } + + public IcebergSourceSplitStatus status() { + return status; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java new file mode 100644 index 000000000000..d4a84a165e1a --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +public enum IcebergSourceSplitStatus { + UNASSIGNED, + ASSIGNED, + COMPLETED +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java new file mode 100644 index 000000000000..319648ca275c --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.split; + +import java.io.Serializable; +import java.util.Comparator; + +public interface SerializableComparator extends Comparator, Serializable {} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java new file mode 100644 index 000000000000..841969666ee5 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +import java.io.IOException; +import java.io.Serializable; +import java.io.UTFDataFormatException; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; + +/** + * Helper class to serialize and deserialize strings longer than 65K. The inspiration is mostly + * taken from the class org.apache.flink.core.memory.DataInputSerializer.readUTF and + * org.apache.flink.core.memory.DataOutputSerializer.writeUTF. + */ +class SerializerHelper implements Serializable { + + private SerializerHelper() {} + + /** + * Similar to {@link DataOutputSerializer#writeUTF(String)}. Except this supports larger payloads + * which is up to max integer value. + * + *

Note: This method can be removed once an equivalent method is available in {@link + * DataOutputSerializer}; use that one instead once it is released in Flink version 1.20. + * + *

    See * FLINK-34228 * https://github.com/apache/flink/pull/24191 + * + * @param out the output stream to write the string to. + * @param str the string value to be written. + */ + public static void writeLongUTF(DataOutputSerializer out, String str) throws IOException { + int strlen = str.length(); + long utflen = 0; + int ch; + + /* use charAt instead of copying String to char array */ + for (int i = 0; i < strlen; i++) { + ch = str.charAt(i); + utflen += getUTFBytesSize(ch); + + if (utflen > Integer.MAX_VALUE) { + throw new UTFDataFormatException("Encoded string reached maximum length: " + utflen); + } + } + + if (utflen > Integer.MAX_VALUE - 4) { + throw new UTFDataFormatException("Encoded string is too long: " + utflen); + } + + out.writeInt((int) utflen); + writeUTFBytes(out, str, (int) utflen); + } + + /** + * Similar to {@link DataInputDeserializer#readUTF()}. Except this supports larger payloads which + * is up to max integer value. + * + *

Note: This method can be removed once an equivalent method is available in {@link + * DataOutputSerializer}; use that one instead once it is released in Flink version 1.20. + * + *

    See * FLINK-34228 * https://github.com/apache/flink/pull/24191 + * + * @param in the input stream to read the string from. + * @return the string value read from the input stream. + * @throws IOException if an I/O error occurs when reading from the input stream. + */ + public static String readLongUTF(DataInputDeserializer in) throws IOException { + int utflen = in.readInt(); + byte[] bytearr = new byte[utflen]; + char[] chararr = new char[utflen]; + + int ch; + int char2; + int char3; + int count = 0; + int chararrCount = 0; + + in.readFully(bytearr, 0, utflen); + + while (count < utflen) { + ch = (int) bytearr[count] & 0xff; + if (ch > 127) { + break; + } + count++; + chararr[chararrCount++] = (char) ch; + } + + while (count < utflen) { + ch = (int) bytearr[count] & 0xff; + switch (ch >> 4) { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + /* 0xxxxxxx */ + count++; + chararr[chararrCount++] = (char) ch; + break; + case 12: + case 13: + /* 110x xxxx 10xx xxxx */ + count += 2; + if (count > utflen) { + throw new UTFDataFormatException("malformed input: partial character at end"); + } + char2 = bytearr[count - 1]; + if ((char2 & 0xC0) != 0x80) { + throw new UTFDataFormatException("malformed input around byte " + count); + } + chararr[chararrCount++] = (char) (((ch & 0x1F) << 6) | (char2 & 0x3F)); + break; + case 14: + /* 1110 xxxx 10xx xxxx 10xx xxxx */ + count += 3; + if (count > utflen) { + throw new UTFDataFormatException("malformed input: partial character at end"); + } + char2 = bytearr[count - 2]; + char3 = bytearr[count - 1]; + if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) { + throw new UTFDataFormatException("malformed input around byte " + (count - 1)); + } + chararr[chararrCount++] = + (char) (((ch & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F)); + break; + default: + /* 10xx xxxx, 1111 xxxx */ + throw new UTFDataFormatException("malformed input around byte " + count); + } + } + // The number of chars produced may be less than utflen + return new String(chararr, 0, chararrCount); + } + + private static int getUTFBytesSize(int ch) { + if ((ch >= 0x0001) && (ch <= 0x007F)) { + return 1; + } else if (ch > 0x07FF) { + return 3; + } else { + return 2; + } + } + + private static void writeUTFBytes(DataOutputSerializer out, String str, int utflen) + throws IOException { + int strlen = str.length(); + int ch; + + int len = Math.max(1024, utflen); + + byte[] bytearr = new byte[len]; + int count = 0; + + int index; + for (index = 0; index < strlen; index++) { + ch = str.charAt(index); + if (!((ch >= 0x0001) && (ch <= 0x007F))) { + break; + } + bytearr[count++] = (byte) ch; + } + + for (; index < strlen; index++) { + ch = str.charAt(index); + if ((ch >= 0x0001) && (ch <= 0x007F)) { + bytearr[count++] = (byte) ch; + } else if (ch > 0x07FF) { + bytearr[count++] = (byte) (0xE0 | ((ch >> 12) & 0x0F)); + bytearr[count++] = (byte) (0x80 | ((ch >> 6) & 0x3F)); + bytearr[count++] = (byte) (0x80 | (ch & 0x3F)); + } else { + bytearr[count++] = (byte) (0xC0 | ((ch >> 6) & 0x1F)); + bytearr[count++] = (byte) (0x80 | (ch & 0x3F)); + } + } + + out.write(bytearr, 0, count); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java new file mode 100644 index 000000000000..37bddfbb7182 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java @@ -0,0 
+1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +import org.apache.iceberg.flink.FlinkReadOptions; +import org.apache.iceberg.flink.source.reader.SplitWatermarkExtractor; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Provides implementations of {@link org.apache.iceberg.flink.source.split.SerializableComparator} + * which could be used for ordering splits. These are used by the {@link + * org.apache.iceberg.flink.source.assigner.OrderedSplitAssignerFactory} and the {@link + * org.apache.iceberg.flink.source.reader.IcebergSourceReader} + */ +public class SplitComparators { + private SplitComparators() {} + + /** Comparator which orders the splits based on the file sequence number of the data files */ + public static SerializableComparator fileSequenceNumber() { + return (IcebergSourceSplit o1, IcebergSourceSplit o2) -> { + Preconditions.checkArgument( + o1.task().files().size() == 1 && o2.task().files().size() == 1, + "Could not compare combined task. Please use '%s' to prevent combining multiple files to a split", + FlinkReadOptions.SPLIT_FILE_OPEN_COST); + + Long seq1 = o1.task().files().iterator().next().file().fileSequenceNumber(); + Long seq2 = o2.task().files().iterator().next().file().fileSequenceNumber(); + + Preconditions.checkNotNull( + seq1, + "Invalid file sequence number: null. Doesn't support splits written with V1 format: %s", + o1); + Preconditions.checkNotNull( + seq2, + "Invalid file sequence number: null. Doesn't support splits written with V1 format: %s", + o2); + + int temp = Long.compare(seq1, seq2); + if (temp != 0) { + return temp; + } else { + return o1.splitId().compareTo(o2.splitId()); + } + }; + } + + /** Comparator which orders the splits based on watermark of the splits */ + public static SerializableComparator watermark( + SplitWatermarkExtractor watermarkExtractor) { + return (IcebergSourceSplit o1, IcebergSourceSplit o2) -> { + long watermark1 = watermarkExtractor.extractWatermark(o1); + long watermark2 = watermarkExtractor.extractWatermark(o2); + + int temp = Long.compare(watermark1, watermark2); + if (temp != 0) { + return temp; + } else { + return o1.splitId().compareTo(o2.splitId()); + } + }; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java new file mode 100644 index 000000000000..eabd757aa638 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
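[Editorial usage sketch, not part of this patch.] A short example for the comparators above; pendingSplits is an assumed List<IcebergSourceSplit> in which every split wraps exactly one file, as the precondition requires.

// Order splits by data-file sequence number; ties are broken by splitId.
SerializableComparator<IcebergSourceSplit> bySequenceNumber = SplitComparators.fileSequenceNumber();
pendingSplits.sort(bySequenceNumber);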
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +import java.util.Collection; +import java.util.Collections; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.SourceEvent; + +/** We can remove this class once FLINK-21364 is resolved. */ +@Internal +public class SplitRequestEvent implements SourceEvent { + private static final long serialVersionUID = 1L; + + private final Collection finishedSplitIds; + private final String requesterHostname; + + public SplitRequestEvent() { + this(Collections.emptyList()); + } + + public SplitRequestEvent(Collection finishedSplitIds) { + this(finishedSplitIds, null); + } + + public SplitRequestEvent(Collection finishedSplitIds, String requesterHostname) { + this.finishedSplitIds = finishedSplitIds; + this.requesterHostname = requesterHostname; + } + + public Collection finishedSplitIds() { + return finishedSplitIds; + } + + public String requesterHostname() { + return requesterHostname; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java new file mode 100644 index 000000000000..6306e82d5729 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.util; + +import java.util.concurrent.TimeUnit; +import org.apache.flink.annotation.Internal; +import org.apache.flink.metrics.Gauge; + +/** + * This gauge measures the elapsed time between now and last recorded time set by {@link + * ElapsedTimeGauge#refreshLastRecordedTime()}. 
+ */ +@Internal +public class ElapsedTimeGauge implements Gauge { + private final TimeUnit reportUnit; + private volatile long lastRecordedTimeNano; + + public ElapsedTimeGauge(TimeUnit timeUnit) { + this.reportUnit = timeUnit; + refreshLastRecordedTime(); + } + + public void refreshLastRecordedTime() { + this.lastRecordedTimeNano = System.nanoTime(); + } + + @Override + public Long getValue() { + return reportUnit.convert(System.nanoTime() - lastRecordedTimeNano, TimeUnit.NANOSECONDS); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java new file mode 100644 index 000000000000..2bbc9cf208fe --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.util; + +import java.util.List; +import java.util.Map; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.TableChange; +import org.apache.flink.table.catalog.UniqueConstraint; +import org.apache.iceberg.Table; +import org.apache.iceberg.Transaction; +import org.apache.iceberg.UpdateProperties; +import org.apache.iceberg.UpdateSchema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Type; + +public class FlinkAlterTableUtil { + private FlinkAlterTableUtil() {} + + public static void commitChanges( + Table table, + String setLocation, + String setSnapshotId, + String pickSnapshotId, + Map setProperties) { + commitManageSnapshots(table, setSnapshotId, pickSnapshotId); + + Transaction transaction = table.newTransaction(); + + if (setLocation != null) { + transaction.updateLocation().setLocation(setLocation).commit(); + } + + if (!setProperties.isEmpty()) { + UpdateProperties updateProperties = transaction.updateProperties(); + setProperties.forEach( + (k, v) -> { + if (v == null) { + updateProperties.remove(k); + } else { + updateProperties.set(k, v); + } + }); + updateProperties.commit(); + } + + transaction.commitTransaction(); + } + + public static void commitChanges( + Table table, + String setLocation, + String setSnapshotId, + String pickSnapshotId, + List schemaChanges, + List propertyChanges) { + commitManageSnapshots(table, setSnapshotId, pickSnapshotId); + + Transaction transaction = table.newTransaction(); + + if (setLocation != null) { + transaction.updateLocation().setLocation(setLocation).commit(); + } + + if (!schemaChanges.isEmpty()) { + UpdateSchema updateSchema = transaction.updateSchema(); + FlinkAlterTableUtil.applySchemaChanges(updateSchema, 
schemaChanges); + updateSchema.commit(); + } + + if (!propertyChanges.isEmpty()) { + UpdateProperties updateProperties = transaction.updateProperties(); + FlinkAlterTableUtil.applyPropertyChanges(updateProperties, propertyChanges); + updateProperties.commit(); + } + + transaction.commitTransaction(); + } + + public static void commitManageSnapshots( + Table table, String setSnapshotId, String cherrypickSnapshotId) { + // don't allow setting the snapshot and picking a commit at the same time because order is + // ambiguous and choosing one order leads to different results + Preconditions.checkArgument( + setSnapshotId == null || cherrypickSnapshotId == null, + "Cannot set the current snapshot ID and cherry-pick snapshot changes"); + + if (setSnapshotId != null) { + long newSnapshotId = Long.parseLong(setSnapshotId); + table.manageSnapshots().setCurrentSnapshot(newSnapshotId).commit(); + } + + // if updating the table snapshot, perform that update first in case it fails + if (cherrypickSnapshotId != null) { + long newSnapshotId = Long.parseLong(cherrypickSnapshotId); + table.manageSnapshots().cherrypick(newSnapshotId).commit(); + } + } + + /** + * Applies a list of Flink table changes to an {@link UpdateSchema} operation. + * + * @param pendingUpdate an uncommitted UpdateSchema operation to configure + * @param schemaChanges a list of Flink table changes + */ + public static void applySchemaChanges( + UpdateSchema pendingUpdate, List schemaChanges) { + for (TableChange change : schemaChanges) { + if (change instanceof TableChange.AddColumn) { + TableChange.AddColumn addColumn = (TableChange.AddColumn) change; + Column flinkColumn = addColumn.getColumn(); + Preconditions.checkArgument( + FlinkCompatibilityUtil.isPhysicalColumn(flinkColumn), + "Unsupported table change: Adding computed column %s.", + flinkColumn.getName()); + Type icebergType = FlinkSchemaUtil.convert(flinkColumn.getDataType().getLogicalType()); + if (flinkColumn.getDataType().getLogicalType().isNullable()) { + pendingUpdate.addColumn( + flinkColumn.getName(), icebergType, flinkColumn.getComment().orElse(null)); + } else { + pendingUpdate.addRequiredColumn( + flinkColumn.getName(), icebergType, flinkColumn.getComment().orElse(null)); + } + } else if (change instanceof TableChange.ModifyColumn) { + TableChange.ModifyColumn modifyColumn = (TableChange.ModifyColumn) change; + applyModifyColumn(pendingUpdate, modifyColumn); + } else if (change instanceof TableChange.DropColumn) { + TableChange.DropColumn dropColumn = (TableChange.DropColumn) change; + pendingUpdate.deleteColumn(dropColumn.getColumnName()); + } else if (change instanceof TableChange.AddWatermark) { + throw new UnsupportedOperationException("Unsupported table change: AddWatermark."); + } else if (change instanceof TableChange.ModifyWatermark) { + throw new UnsupportedOperationException("Unsupported table change: ModifyWatermark."); + } else if (change instanceof TableChange.DropWatermark) { + throw new UnsupportedOperationException("Unsupported table change: DropWatermark."); + } else if (change instanceof TableChange.AddUniqueConstraint) { + TableChange.AddUniqueConstraint addPk = (TableChange.AddUniqueConstraint) change; + applyUniqueConstraint(pendingUpdate, addPk.getConstraint()); + } else if (change instanceof TableChange.ModifyUniqueConstraint) { + TableChange.ModifyUniqueConstraint modifyPk = (TableChange.ModifyUniqueConstraint) change; + applyUniqueConstraint(pendingUpdate, modifyPk.getNewConstraint()); + } else if (change instanceof 
TableChange.DropConstraint) { + throw new UnsupportedOperationException("Unsupported table change: DropConstraint."); + } else { + throw new UnsupportedOperationException("Cannot apply unknown table change: " + change); + } + } + } + + /** + * Applies a list of Flink table property changes to an {@link UpdateProperties} operation. + * + * @param pendingUpdate an uncommitted UpdateProperty operation to configure + * @param propertyChanges a list of Flink table changes + */ + public static void applyPropertyChanges( + UpdateProperties pendingUpdate, List propertyChanges) { + for (TableChange change : propertyChanges) { + if (change instanceof TableChange.SetOption) { + TableChange.SetOption setOption = (TableChange.SetOption) change; + pendingUpdate.set(setOption.getKey(), setOption.getValue()); + } else if (change instanceof TableChange.ResetOption) { + TableChange.ResetOption resetOption = (TableChange.ResetOption) change; + pendingUpdate.remove(resetOption.getKey()); + } else { + throw new UnsupportedOperationException( + "The given table change is not a property change: " + change); + } + } + } + + private static void applyModifyColumn( + UpdateSchema pendingUpdate, TableChange.ModifyColumn modifyColumn) { + if (modifyColumn instanceof TableChange.ModifyColumnName) { + TableChange.ModifyColumnName modifyName = (TableChange.ModifyColumnName) modifyColumn; + pendingUpdate.renameColumn(modifyName.getOldColumnName(), modifyName.getNewColumnName()); + } else if (modifyColumn instanceof TableChange.ModifyColumnPosition) { + TableChange.ModifyColumnPosition modifyPosition = + (TableChange.ModifyColumnPosition) modifyColumn; + applyModifyColumnPosition(pendingUpdate, modifyPosition); + } else if (modifyColumn instanceof TableChange.ModifyPhysicalColumnType) { + TableChange.ModifyPhysicalColumnType modifyType = + (TableChange.ModifyPhysicalColumnType) modifyColumn; + Type type = FlinkSchemaUtil.convert(modifyType.getNewType().getLogicalType()); + String columnName = modifyType.getOldColumn().getName(); + pendingUpdate.updateColumn(columnName, type.asPrimitiveType()); + if (modifyType.getNewColumn().getDataType().getLogicalType().isNullable()) { + pendingUpdate.makeColumnOptional(columnName); + } else { + pendingUpdate.requireColumn(columnName); + } + } else if (modifyColumn instanceof TableChange.ModifyColumnComment) { + TableChange.ModifyColumnComment modifyComment = + (TableChange.ModifyColumnComment) modifyColumn; + pendingUpdate.updateColumnDoc( + modifyComment.getOldColumn().getName(), modifyComment.getNewComment()); + } else { + throw new UnsupportedOperationException( + "Cannot apply unknown modify-column change: " + modifyColumn); + } + } + + private static void applyModifyColumnPosition( + UpdateSchema pendingUpdate, TableChange.ModifyColumnPosition modifyColumnPosition) { + TableChange.ColumnPosition newPosition = modifyColumnPosition.getNewPosition(); + if (newPosition instanceof TableChange.First) { + pendingUpdate.moveFirst(modifyColumnPosition.getOldColumn().getName()); + } else if (newPosition instanceof TableChange.After) { + TableChange.After after = (TableChange.After) newPosition; + pendingUpdate.moveAfter(modifyColumnPosition.getOldColumn().getName(), after.column()); + } else { + throw new UnsupportedOperationException( + "Cannot apply unknown modify-column-position change: " + modifyColumnPosition); + } + } + + private static void applyUniqueConstraint( + UpdateSchema pendingUpdate, UniqueConstraint constraint) { + switch (constraint.getType()) { + case PRIMARY_KEY: + 
pendingUpdate.setIdentifierFields(constraint.getColumns()); + break; + case UNIQUE_KEY: + throw new UnsupportedOperationException( + "Unsupported table change: setting unique key constraints."); + default: + throw new UnsupportedOperationException( + "Cannot apply unknown unique constraint: " + constraint.getType().name()); + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java new file mode 100644 index 000000000000..38bd73b87127 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.util; + +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.legacy.api.TableColumn; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.table.types.logical.RowType; + +/** + * This is a small util class that try to hide calls to Flink Internal or PublicEvolve interfaces as + * Flink can change those APIs during minor version release. + */ +public class FlinkCompatibilityUtil { + + private FlinkCompatibilityUtil() {} + + public static TypeInformation toTypeInfo(RowType rowType) { + return InternalTypeInfo.of(rowType); + } + + /** + * @deprecated since 1.10.0, will be removed in 2.0.0. + */ + @Deprecated + public static boolean isPhysicalColumn(TableColumn column) { + return column.isPhysical(); + } + + public static boolean isPhysicalColumn(Column column) { + return column.isPhysical(); + } + + public static boolean isPhysicalColumn(Schema.UnresolvedColumn column) { + return column instanceof Schema.UnresolvedPhysicalColumn; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java new file mode 100644 index 000000000000..20b33e615e5f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
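[Editorial usage sketch, not part of this patch.] To illustrate how applySchemaChanges and applyPropertyChanges above are meant to be combined (mirroring commitChanges earlier in the same file), here is a hedged sketch; `table` is an assumed Iceberg Table and the two change lists are assumed to come from a Flink ALTER TABLE statement.

Transaction transaction = table.newTransaction();

UpdateSchema updateSchema = transaction.updateSchema();
FlinkAlterTableUtil.applySchemaChanges(updateSchema, schemaChanges);
updateSchema.commit();

UpdateProperties updateProperties = transaction.updateProperties();
FlinkAlterTableUtil.applyPropertyChanges(updateProperties, propertyChanges);
updateProperties.commit();

transaction.commitTransaction();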
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.util; + +import java.util.concurrent.atomic.AtomicReference; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; + +public class FlinkPackage { + + private static final AtomicReference VERSION = new AtomicReference<>(); + public static final String FLINK_UNKNOWN_VERSION = "FLINK-UNKNOWN-VERSION"; + + private FlinkPackage() {} + + /** Returns Flink version string like x.y.z */ + public static String version() { + if (null == VERSION.get()) { + String detectedVersion; + try { + detectedVersion = versionFromJar(); + // use unknown version in case exact implementation version can't be found from the jar + // (this can happen if the DataStream class appears multiple times in the same classpath + // such as with shading) + detectedVersion = detectedVersion != null ? detectedVersion : FLINK_UNKNOWN_VERSION; + } catch (Exception e) { + detectedVersion = FLINK_UNKNOWN_VERSION; + } + VERSION.set(detectedVersion); + } + + return VERSION.get(); + } + + @VisibleForTesting + static String versionFromJar() { + // Choose {@link DataStream} class because it is one of the core Flink API + return DataStream.class.getPackage().getImplementationVersion(); + } + + @VisibleForTesting + static void setVersion(String version) { + VERSION.set(version); + } +} diff --git a/flink/v2.0/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v2.0/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory new file mode 100644 index 000000000000..3034b2d3754a --- /dev/null +++ b/flink/v2.0/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
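A sketch of what the factories registered in this service file enable: creating and using an Iceberg catalog from Flink SQL. The catalog name and warehouse path are illustrative.

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

public class CreateCatalogSketch {
  public static void main(String[] args) {
    TableEnvironment env = TableEnvironment.create(EnvironmentSettings.inStreamingMode());
    // The 'type'='iceberg' option is resolved through FlinkCatalogFactory via Java SPI.
    env.executeSql(
        "CREATE CATALOG iceberg_catalog WITH ("
            + "'type'='iceberg',"
            + "'catalog-type'='hadoop',"
            + "'warehouse'='file:///tmp/iceberg-warehouse')");
    env.executeSql("USE CATALOG iceberg_catalog");
    env.executeSql("SHOW DATABASES").print();
  }
}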
+ +org.apache.iceberg.flink.FlinkCatalogFactory +org.apache.iceberg.flink.FlinkDynamicTableFactory diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java new file mode 100644 index 000000000000..4184526a6a1a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.junit.jupiter.api.Test; + +public abstract class AvroGenericRecordConverterBase { + protected abstract void testConverter(DataGenerator dataGenerator) throws Exception; + + @Test + public void testPrimitiveTypes() throws Exception { + testConverter(new DataGenerators.Primitives()); + } + + @Test + public void testStructOfPrimitive() throws Exception { + testConverter(new DataGenerators.StructOfPrimitive()); + } + + @Test + public void testStructOfArray() throws Exception { + testConverter(new DataGenerators.StructOfArray()); + } + + @Test + public void testStructOfMap() throws Exception { + testConverter(new DataGenerators.StructOfMap()); + } + + @Test + public void testStructOfStruct() throws Exception { + testConverter(new DataGenerators.StructOfStruct()); + } + + @Test + public void testArrayOfPrimitive() throws Exception { + testConverter(new DataGenerators.ArrayOfPrimitive()); + } + + @Test + public void testArrayOfArray() throws Exception { + testConverter(new DataGenerators.ArrayOfArray()); + } + + @Test + public void testArrayOfMap() throws Exception { + testConverter(new DataGenerators.ArrayOfMap()); + } + + @Test + public void testArrayOfStruct() throws Exception { + testConverter(new DataGenerators.ArrayOfStruct()); + } + + @Test + public void testMapOfPrimitives() throws Exception { + testConverter(new DataGenerators.MapOfPrimitives()); + } + + @Test + public void testMapOfArray() throws Exception { + testConverter(new DataGenerators.MapOfArray()); + } + + @Test + public void testMapOfMap() throws Exception { + testConverter(new DataGenerators.MapOfMap()); + } + + @Test + public void testMapOfStruct() throws Exception { + testConverter(new DataGenerators.MapOfStruct()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java new file mode 100644 index 000000000000..062ff68d5d85 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
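A sketch of how the abstract converter test above is typically extended; the conversion step in the body is hypothetical and stands in for whatever Avro-to-RowData converter a concrete test exercises, while the base class contributes one @Test per generator in DataGenerators.

import org.apache.iceberg.flink.AvroGenericRecordConverterBase;
import org.apache.iceberg.flink.DataGenerator;

public class AvroConverterTestSketch extends AvroGenericRecordConverterBase {
  @Override
  protected void testConverter(DataGenerator dataGenerator) throws Exception {
    // Hypothetical flow: feed dataGenerator.generateAvroGenericRecord() to the
    // converter under test and compare the result with
    // dataGenerator.generateFlinkRowData().
  }
}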
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.io.File; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import org.apache.flink.util.ArrayUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.SupportsNamespaces; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.relocated.com.google.common.base.Joiner; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public abstract class CatalogTestBase extends TestBase { + + protected static final String DATABASE = "db"; + @TempDir protected File hiveWarehouse; + @TempDir protected File hadoopWarehouse; + + @Parameter(index = 0) + protected String catalogName; + + @Parameter(index = 1) + protected Namespace baseNamespace; + + protected Catalog validationCatalog; + protected SupportsNamespaces validationNamespaceCatalog; + protected Map config = Maps.newHashMap(); + + protected String flinkDatabase; + protected Namespace icebergNamespace; + protected boolean isHadoopCatalog; + + @Parameters(name = "catalogName={0}, baseNamespace={1}") + protected static List parameters() { + return Arrays.asList( + new Object[] {"testhive", Namespace.empty()}, + new Object[] {"testhadoop", Namespace.empty()}, + new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1")}); + } + + @BeforeEach + public void before() { + this.isHadoopCatalog = catalogName.startsWith("testhadoop"); + this.validationCatalog = + isHadoopCatalog + ? new HadoopCatalog(hiveConf, "file:" + hadoopWarehouse.getPath()) + : catalog; + this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; + + config.put("type", "iceberg"); + if (!baseNamespace.isEmpty()) { + config.put(FlinkCatalogFactory.BASE_NAMESPACE, baseNamespace.toString()); + } + if (isHadoopCatalog) { + config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hadoop"); + } else { + config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); + config.put(CatalogProperties.URI, getURI(hiveConf)); + } + config.put(CatalogProperties.WAREHOUSE_LOCATION, String.format("file://%s", warehouseRoot())); + + this.flinkDatabase = catalogName + "." 
+ DATABASE; + this.icebergNamespace = + Namespace.of(ArrayUtils.concat(baseNamespace.levels(), new String[] {DATABASE})); + sql("CREATE CATALOG %s WITH %s", catalogName, toWithClause(config)); + } + + @AfterEach + public void clean() { + dropCatalog(catalogName, true); + } + + protected String warehouseRoot() { + if (isHadoopCatalog) { + return hadoopWarehouse.getAbsolutePath(); + } else { + return hiveWarehouse.getAbsolutePath(); + } + } + + protected String getFullQualifiedTableName(String tableName) { + final List levels = Lists.newArrayList(icebergNamespace.levels()); + levels.add(tableName); + return Joiner.on('.').join(levels); + } + + static String getURI(HiveConf conf) { + return conf.get(HiveConf.ConfVars.METASTOREURIS.varname); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java new file mode 100644 index 000000000000..b1e3b20ff7ac --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericRecord; + +/** + * This interface defines test data generator. Different implementations for primitive and complex + * nested fields are defined in {@link DataGenerators}. + */ +public interface DataGenerator { + Schema icebergSchema(); + + RowType flinkRowType(); + + org.apache.avro.Schema avroSchema(); + + GenericRecord generateIcebergGenericRecord(); + + GenericRowData generateFlinkRowData(); + + org.apache.avro.generic.GenericRecord generateAvroGenericRecord(); +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java new file mode 100644 index 000000000000..e2cd411d7069 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java @@ -0,0 +1,1172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.types.Types.NestedField.required; + +import com.fasterxml.jackson.databind.node.IntNode; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.util.Arrays; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import org.apache.avro.LogicalTypes; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.generic.GenericData; +import org.apache.avro.util.Utf8; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.Days; + +/** + * Util class to generate test data with extensive coverage different field types: from primitives + * to complex nested types. 
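A short usage sketch for the generators defined below: each one produces the same logical row in Iceberg, Flink, and Avro form so that a conversion can be checked against an expected representation. The printed summary is illustrative only.

import org.apache.flink.table.data.GenericRowData;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.flink.DataGenerator;
import org.apache.iceberg.flink.DataGenerators;

public class DataGeneratorSketch {
  public static void main(String[] args) {
    DataGenerator generator = new DataGenerators.Primitives();
    Schema schema = generator.icebergSchema();                            // Iceberg schema
    GenericRecord icebergRow = generator.generateIcebergGenericRecord();  // Iceberg representation
    GenericRowData flinkRow = generator.generateFlinkRowData();           // Flink representation
    org.apache.avro.generic.GenericRecord avroRow = generator.generateAvroGenericRecord();
    System.out.printf(
        "%d columns, flink arity %d, avro record %s%n",
        schema.columns().size(), flinkRow.getArity(), avroRow.getSchema().getName());
  }
}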
+ */ +public class DataGenerators { + + public static class Primitives implements DataGenerator { + private static final DateTime JODA_DATETIME_EPOC = + new DateTime(1970, 1, 1, 0, 0, 0, 0, DateTimeZone.UTC); + private static final DateTime JODA_DATETIME_20220110 = + new DateTime(2022, 1, 10, 0, 0, 0, 0, DateTimeZone.UTC); + private static final int DAYS_BTW_EPOC_AND_20220110 = + Days.daysBetween(JODA_DATETIME_EPOC, JODA_DATETIME_20220110).getDays(); + private static final int HOUR_8_IN_MILLI = (int) TimeUnit.HOURS.toMillis(8); + + private static final LocalDate JAVA_LOCAL_DATE_20220110 = LocalDate.of(2022, 1, 10); + private static final LocalTime JAVA_LOCAL_TIME_HOUR8 = LocalTime.of(8, 0); + private static final OffsetDateTime JAVA_OFFSET_DATE_TIME_20220110 = + OffsetDateTime.of(2022, 1, 10, 0, 0, 0, 0, ZoneOffset.UTC); + private static final LocalDateTime JAVA_LOCAL_DATE_TIME_20220110 = + LocalDateTime.of(2022, 1, 10, 0, 0, 0); + private static final BigDecimal BIG_DECIMAL_NEGATIVE = new BigDecimal("-1.50"); + private static final byte[] FIXED_BYTES = "012345689012345".getBytes(StandardCharsets.UTF_8); + + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + // primitive types + Types.NestedField.optional(2, "boolean_field", Types.BooleanType.get()), + Types.NestedField.optional(3, "int_field", Types.IntegerType.get()), + Types.NestedField.optional(4, "long_field", Types.LongType.get()), + Types.NestedField.optional(5, "float_field", Types.FloatType.get()), + Types.NestedField.optional(6, "double_field", Types.DoubleType.get()), + Types.NestedField.required(7, "string_field", Types.StringType.get()), + Types.NestedField.required(8, "date_field", Types.DateType.get()), + Types.NestedField.required(9, "time_field", Types.TimeType.get()), + Types.NestedField.required(10, "ts_with_zone_field", Types.TimestampType.withZone()), + Types.NestedField.required( + 11, "ts_without_zone_field", Types.TimestampType.withoutZone()), + Types.NestedField.required(12, "uuid_field", Types.UUIDType.get()), + Types.NestedField.required(13, "binary_field", Types.BinaryType.get()), + Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2)), + Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + /** + * Fix up Avro Schema that is converted from Iceberg Schema. + * + * @param schemaConvertedFromIceberg Avro Schema converted from Iceberg schema via {@link + * AvroSchemaUtil#convert(Schema, String)} + */ + private org.apache.avro.Schema fixupAvroSchemaConvertedFromIcebergSchema( + org.apache.avro.Schema schemaConvertedFromIceberg) { + List fixedFields = + schemaConvertedFromIceberg.getFields().stream() + .map( + field -> { + org.apache.avro.Schema.Field updatedField = field; + if (field.name().equals("time_field")) { + // Iceberg's AvroSchemaUtil uses timestamp-micros with Long value for time + // field, while AvroToRowDataConverters#convertToTime() always looks for + // Integer value assuming millis. The root problem is that + // AvroToRowDataConverters#createConverter() uses LogicalTypeRoot to + // determine converter and LogicalTypeRoot lost the timestamp precision + // carried by LogicalType like Time(6). 
+ org.apache.avro.Schema fieldSchema = + LogicalTypes.timeMillis() + .addToSchema( + org.apache.avro.Schema.create(org.apache.avro.Schema.Type.INT)); + updatedField = new org.apache.avro.Schema.Field("time_field", fieldSchema); + } + + return new org.apache.avro.Schema.Field(updatedField, updatedField.schema()); + }) + .collect(Collectors.toList()); + return org.apache.avro.Schema.createRecord( + schemaConvertedFromIceberg.getName(), + schemaConvertedFromIceberg.getDoc(), + schemaConvertedFromIceberg.getNamespace(), + schemaConvertedFromIceberg.isError(), + fixedFields); + } + + private final org.apache.avro.Schema avroSchema = + fixupAvroSchemaConvertedFromIcebergSchema(AvroSchemaUtil.convert(icebergSchema, "table")); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("boolean_field", false); + genericRecord.setField("int_field", Integer.MAX_VALUE); + genericRecord.setField("long_field", Long.MAX_VALUE); + genericRecord.setField("float_field", Float.MAX_VALUE); + genericRecord.setField("double_field", Double.MAX_VALUE); + genericRecord.setField("string_field", "str"); + + genericRecord.setField("date_field", JAVA_LOCAL_DATE_20220110); + genericRecord.setField("time_field", JAVA_LOCAL_TIME_HOUR8); + genericRecord.setField("ts_with_zone_field", JAVA_OFFSET_DATE_TIME_20220110); + genericRecord.setField("ts_without_zone_field", JAVA_LOCAL_DATE_TIME_20220110); + + byte[] uuidBytes = new byte[16]; + for (int i = 0; i < 16; ++i) { + uuidBytes[i] = (byte) i; + } + + genericRecord.setField("uuid_field", UUID.nameUUIDFromBytes(uuidBytes)); + + byte[] binaryBytes = new byte[7]; + for (int i = 0; i < 7; ++i) { + binaryBytes[i] = (byte) i; + } + genericRecord.setField("binary_field", ByteBuffer.wrap(binaryBytes)); + + genericRecord.setField("decimal_field", BIG_DECIMAL_NEGATIVE); + genericRecord.setField("fixed_field", FIXED_BYTES); + + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + byte[] uuidBytes = new byte[16]; + for (int i = 0; i < 16; ++i) { + uuidBytes[i] = (byte) i; + } + + byte[] binaryBytes = new byte[7]; + for (int i = 0; i < 7; ++i) { + binaryBytes[i] = (byte) i; + } + + return GenericRowData.of( + StringData.fromString("row_id_value"), + false, + Integer.MAX_VALUE, + Long.MAX_VALUE, + Float.MAX_VALUE, + Double.MAX_VALUE, + StringData.fromString("str"), + DAYS_BTW_EPOC_AND_20220110, + HOUR_8_IN_MILLI, + // Although Avro logical type for timestamp fields are in micro seconds, + // AvroToRowDataConverters only looks for long value in milliseconds. 
+ TimestampData.fromEpochMillis(JODA_DATETIME_20220110.getMillis()), + TimestampData.fromEpochMillis(JODA_DATETIME_20220110.getMillis()), + uuidBytes, + binaryBytes, + DecimalData.fromBigDecimal(BIG_DECIMAL_NEGATIVE, 9, 2), + FIXED_BYTES); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", new Utf8("row_id_value")); + genericRecord.put("boolean_field", false); + genericRecord.put("int_field", Integer.MAX_VALUE); + genericRecord.put("long_field", Long.MAX_VALUE); + genericRecord.put("float_field", Float.MAX_VALUE); + genericRecord.put("double_field", Double.MAX_VALUE); + genericRecord.put("string_field", new Utf8("str")); + + genericRecord.put("date_field", DAYS_BTW_EPOC_AND_20220110); + genericRecord.put("time_field", HOUR_8_IN_MILLI); + // Although Avro logical type for timestamp fields are in micro seconds, + // AvroToRowDataConverters only looks for long value in milliseconds. + genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis()); + genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis()); + + byte[] uuidBytes = new byte[16]; + for (int i = 0; i < 16; ++i) { + uuidBytes[i] = (byte) i; + } + genericRecord.put("uuid_field", ByteBuffer.wrap(uuidBytes)); + + byte[] binaryBytes = new byte[7]; + for (int i = 0; i < 7; ++i) { + binaryBytes[i] = (byte) i; + } + genericRecord.put("binary_field", ByteBuffer.wrap(binaryBytes)); + + BigDecimal bigDecimal = new BigDecimal("-1.50"); + // unscaledValue().toByteArray() is to match the behavior of RowDataToAvroConverters from + // Flink for decimal type + genericRecord.put("decimal_field", ByteBuffer.wrap(bigDecimal.unscaledValue().toByteArray())); + + genericRecord.put("fixed_field", ByteBuffer.wrap(FIXED_BYTES)); + + return genericRecord; + } + } + + public static class StructOfPrimitive implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "struct_of_primitive", + Types.StructType.of( + required(101, "id", Types.IntegerType.get()), + required(102, "name", Types.StringType.get())))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + Schema structSchema = + new Schema(icebergSchema.findField("struct_of_primitive").type().asStructType().fields()); + GenericRecord struct = GenericRecord.create(structSchema); + struct.setField("id", 1); + struct.setField("name", "Jane"); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("struct_of_primitive", struct); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + GenericRowData.of(1, StringData.fromString("Jane"))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.Schema 
structSchema = avroSchema.getField("struct_of_primitive").schema(); + org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); + struct.put("id", 1); + struct.put("name", "Jane"); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("struct_of_primitive", struct); + return genericRecord; + } + } + + public static class StructOfArray implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "struct_of_array", + Types.StructType.of( + required(101, "id", Types.IntegerType.get()), + required( + 102, "names", Types.ListType.ofRequired(201, Types.StringType.get()))))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + Schema structSchema = + new Schema(icebergSchema.findField("struct_of_array").type().asStructType().fields()); + GenericRecord struct = GenericRecord.create(structSchema); + struct.setField("id", 1); + struct.setField("names", Arrays.asList("Jane", "Joe")); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("struct_of_array", struct); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + StringData[] names = {StringData.fromString("Jane"), StringData.fromString("Joe")}; + return GenericRowData.of( + StringData.fromString("row_id_value"), GenericRowData.of(1, new GenericArrayData(names))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_array").schema(); + org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); + struct.put("id", 1); + struct.put("names", Arrays.asList("Jane", "Joe")); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("struct_of_array", struct); + return genericRecord; + } + } + + public static class StructOfMap implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "struct_of_map", + Types.StructType.of( + required(101, "id", Types.IntegerType.get()), + required( + 102, + "names", + Types.MapType.ofRequired( + 201, 202, Types.StringType.get(), Types.StringType.get()))))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + Schema 
structSchema = + new Schema(icebergSchema.findField("struct_of_map").type().asStructType().fields()); + GenericRecord struct = GenericRecord.create(structSchema); + struct.setField("id", 1); + struct.setField("names", ImmutableMap.of("Jane", "female", "Joe", "male")); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("struct_of_map", struct); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + GenericRowData.of( + 1, + new GenericMapData( + ImmutableMap.of( + StringData.fromString("Jane"), + StringData.fromString("female"), + StringData.fromString("Joe"), + StringData.fromString("male"))))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_map").schema(); + org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); + struct.put("id", 1); + struct.put("names", ImmutableMap.of("Jane", new Utf8("female"), "Joe", new Utf8("male"))); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("struct_of_map", struct); + return genericRecord; + } + } + + public static class StructOfStruct implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "struct_of_struct", + Types.StructType.of( + required(101, "id", Types.IntegerType.get()), + required( + 102, + "person_struct", + Types.StructType.of( + Types.NestedField.required(201, "name", Types.StringType.get()), + Types.NestedField.required(202, "address", Types.StringType.get())))))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + Schema structSchema = + new Schema(icebergSchema.findField("struct_of_struct").type().asStructType().fields()); + Schema personSchema = + new Schema(structSchema.findField("person_struct").type().asStructType().fields()); + GenericRecord person = GenericRecord.create(personSchema); + person.setField("name", "Jane"); + person.setField("address", "Apple Park"); + GenericRecord struct = GenericRecord.create(structSchema); + struct.setField("id", 1); + struct.setField("person_struct", person); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("struct_of_struct", struct); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + GenericRowData.of( + 1, + GenericRowData.of( + StringData.fromString("Jane"), StringData.fromString("Apple Park")))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_struct").schema(); + 
org.apache.avro.Schema personSchema = structSchema.getField("person_struct").schema(); + org.apache.avro.generic.GenericRecord person = new GenericData.Record(personSchema); + person.put("name", "Jane"); + person.put("address", "Apple Park"); + org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); + struct.put("id", 1); + struct.put("person_struct", person); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("struct_of_struct", struct); + return genericRecord; + } + } + + public static class ArrayOfPrimitive implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get()))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("array_of_int", Arrays.asList(1, 2, 3)); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + Integer[] arr = {1, 2, 3}; + return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(arr)); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("array_of_int", Arrays.asList(1, 2, 3)); + return genericRecord; + } + } + + public static class ArrayOfArray implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "array_of_array", + Types.ListType.ofRequired( + 101, Types.ListType.ofRequired(201, Types.IntegerType.get())))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField( + "array_of_array", Arrays.asList(Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6))); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + // non-primitive + Integer[] array1 = {1, 2, 3}; + Integer[] array2 = {4, 5, 6}; + GenericArrayData[] arrayOfArrays = { + new GenericArrayData(array1), new GenericArrayData(array2) + }; + return GenericRowData.of( + StringData.fromString("row_id_value"), new 
GenericArrayData(arrayOfArrays)); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put( + "array_of_array", Arrays.asList(Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6))); + return genericRecord; + } + } + + public static class ArrayOfMap implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "array_of_map", + Types.ListType.ofRequired( + 101, + Types.MapType.ofRequired( + 201, 202, Types.StringType.get(), Types.IntegerType.get())))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField( + "array_of_map", + Arrays.asList( + ImmutableMap.of("Jane", 1, "Joe", 2), ImmutableMap.of("Alice", 3, "Bob", 4))); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + GenericMapData[] array = { + new GenericMapData( + ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2)), + new GenericMapData( + ImmutableMap.of(StringData.fromString("Alice"), 3, StringData.fromString("Bob"), 4)) + }; + return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(array)); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put( + "array_of_map", + Arrays.asList( + ImmutableMap.of("Jane", 1, "Joe", 2), ImmutableMap.of("Alice", 3, "Bob", 4))); + return genericRecord; + } + } + + public static class ArrayOfStruct implements DataGenerator { + private final Types.StructType structType = + Types.StructType.of( + required(201, "id", Types.IntegerType.get()), + required(202, "name", Types.StringType.get())); + private final Schema structIcebergSchema = new Schema(structType.fields()); + private final org.apache.avro.Schema structAvroSchema = + AvroSchemaUtil.convert(structIcebergSchema, "struct"); + + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.optional( + 2, "array_of_struct", Types.ListType.ofRequired(101, structType))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord struct1 = 
GenericRecord.create(structIcebergSchema); + struct1.setField("id", 1); + struct1.setField("name", "Jane"); + GenericRecord struct2 = GenericRecord.create(structIcebergSchema); + struct2.setField("id", 2); + struct2.setField("name", "Joe"); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("array_of_struct", Arrays.asList(struct1, struct2)); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + GenericRowData[] structArray = { + GenericRowData.of(1, StringData.fromString("Jane")), + GenericRowData.of(2, StringData.fromString("Joe")) + }; + return GenericRowData.of( + StringData.fromString("row_id_value"), new GenericArrayData(structArray)); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord struct1 = new GenericData.Record(structAvroSchema); + struct1.put("id", 1); + struct1.put("name", "Jane"); + org.apache.avro.generic.GenericRecord struct2 = new GenericData.Record(structAvroSchema); + struct2.put("id", 2); + struct2.put("name", "Joe"); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("array_of_struct", Arrays.asList(struct1, struct2)); + return genericRecord; + } + } + + public static class MapOfPrimitives implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.optional( + 2, + "map_of_primitives", + Types.MapType.ofRequired( + 101, 102, Types.StringType.get(), Types.IntegerType.get()))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + return genericRecord; + } + } + + public static class MapOfArray implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "map_of_array", + Types.MapType.ofRequired( + 101, + 102, + Types.StringType.get(), + Types.ListType.ofRequired(201, Types.IntegerType.get())))); + + private final RowType rowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema 
avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return rowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField( + "map_of_array", + ImmutableMap.of( + "Jane", Arrays.asList(1, 2, 3), + "Joe", Arrays.asList(4, 5, 6))); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + Integer[] janeArray = {1, 2, 3}; + Integer[] joeArray = {4, 5, 6}; + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("Jane"), + new GenericArrayData(janeArray), + StringData.fromString("Joe"), + new GenericArrayData(joeArray)))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put( + "map_of_array", + ImmutableMap.of( + "Jane", Arrays.asList(1, 2, 3), + "Joe", Arrays.asList(4, 5, 6))); + return genericRecord; + } + } + + public static class MapOfMap implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "map_of_map", + Types.MapType.ofRequired( + 101, + 102, + Types.StringType.get(), + Types.MapType.ofRequired( + 301, 302, Types.StringType.get(), Types.IntegerType.get())))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField( + "map_of_map", + ImmutableMap.of( + "female", ImmutableMap.of("Jane", 1, "Alice", 2), + "male", ImmutableMap.of("Joe", 3, "Bob", 4))); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("female"), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("Jane"), 1, StringData.fromString("Alice"), 2)), + StringData.fromString("male"), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("Joe"), 3, StringData.fromString("Bob"), 4))))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put( + "map_of_map", + ImmutableMap.of( + "female", ImmutableMap.of("Jane", 1, "Alice", 2), + "male", ImmutableMap.of("Joe", 3, "Bob", 4))); + return genericRecord; + } + } + + public static class MapOfStruct implements 
DataGenerator { + private org.apache.avro.Schema createAvroSchemaIdField() { + org.apache.avro.Schema schema = SchemaBuilder.builder().intType(); + // this is needed to match the converter generated schema props + schema.addProp("field-id", IntNode.valueOf(201)); + return schema; + } + + private org.apache.avro.Schema createAvroSchemaNameField() { + org.apache.avro.Schema schema = SchemaBuilder.builder().stringType(); + // this is needed to match the converter generated schema props + schema.addProp("field-id", IntNode.valueOf(202)); + return schema; + } + + private final Types.StructType structType = + Types.StructType.of( + required(201, "id", Types.IntegerType.get()), + required(202, "name", Types.StringType.get())); + private final Schema structIcebergSchema = new Schema(structType.fields()); + + private final org.apache.avro.Schema structAvroSchema = + SchemaBuilder.builder() + .record("struct") + .fields() + .name("id") + .type(createAvroSchemaIdField()) + .noDefault() + .name("name") + .type(createAvroSchemaNameField()) + .noDefault() + .endRecord(); + + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "map_of_struct", + Types.MapType.ofRequired(101, 102, Types.StringType.get(), structType))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + // Can't use AvroSchemaUtil.convert otherwise the nested schema will have generated name like + // `r102` not the specified name like `struct`. + org.apache.avro.Schema avroSchema = + SchemaBuilder.builder() + .record("table") + .fields() + .requiredString("row_id") + .name("map_of_struct") + .type(SchemaBuilder.builder().map().values(structAvroSchema)) + .noDefault() + .endRecord(); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord struct1 = GenericRecord.create(structIcebergSchema); + struct1.setField("id", 1); + struct1.setField("name", "Jane"); + GenericRecord struct2 = GenericRecord.create(structIcebergSchema); + struct2.setField("id", 2); + struct2.setField("name", "Joe"); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField( + "map_of_struct", ImmutableMap.of("struct1", struct1, "struct2", struct2)); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("struct1"), + GenericRowData.of(1, StringData.fromString("Jane")), + StringData.fromString("struct2"), + GenericRowData.of(2, StringData.fromString("Joe"))))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord struct1 = new GenericData.Record(structAvroSchema); + struct1.put("id", 1); + struct1.put("name", new Utf8("Jane")); + org.apache.avro.generic.GenericRecord struct2 = new GenericData.Record(structAvroSchema); + struct2.put("id", 2); + struct2.put("name", new Utf8("Joe")); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", new Utf8("row_id_value")); + 
genericRecord.put("map_of_struct", ImmutableMap.of("struct1", struct1, "struct2", struct2)); + return genericRecord; + } + } + + public static class MapOfStructStruct implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.optional( + 2, + "map", + Types.MapType.ofOptional( + 101, + 102, + Types.StructType.of( + Types.NestedField.required(201, "key", Types.LongType.get()), + Types.NestedField.optional(202, "keyData", Types.StringType.get())), + Types.StructType.of( + Types.NestedField.required(203, "value", Types.LongType.get()), + Types.NestedField.optional(204, "valueData", Types.StringType.get()))))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + throw new UnsupportedOperationException( + "Not applicable as Avro Map only support string key type"); + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + throw new UnsupportedOperationException("Not implemented yet"); + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of( + GenericRowData.of(1L, StringData.fromString("key_data")), + GenericRowData.of(1L, StringData.fromString("value_data"))))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + throw new UnsupportedOperationException("Avro Map only support string key type"); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java new file mode 100644 index 000000000000..fd5c6b76b683 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.UUID; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.extension.AfterAllCallback; +import org.junit.jupiter.api.extension.AfterEachCallback; +import org.junit.jupiter.api.extension.BeforeAllCallback; +import org.junit.jupiter.api.extension.BeforeEachCallback; +import org.junit.jupiter.api.extension.ExtensionContext; + +public class HadoopCatalogExtension + implements BeforeAllCallback, BeforeEachCallback, AfterAllCallback, AfterEachCallback { + protected final String database; + protected final String tableName; + + protected Path temporaryFolder; + protected Catalog catalog; + protected CatalogLoader catalogLoader; + protected String warehouse; + protected TableLoader tableLoader; + + public HadoopCatalogExtension(String database, String tableName) { + this.database = database; + this.tableName = tableName; + } + + @Override + public void beforeAll(ExtensionContext context) throws Exception { + this.temporaryFolder = Files.createTempDirectory("junit5_hadoop_catalog-"); + } + + @Override + public void afterAll(ExtensionContext context) throws Exception { + FileUtils.deleteDirectory(temporaryFolder.toFile()); + } + + @Override + public void beforeEach(ExtensionContext context) throws Exception { + assertThat(temporaryFolder).exists().isDirectory(); + this.warehouse = "file:" + temporaryFolder + "/" + UUID.randomUUID(); + this.catalogLoader = + CatalogLoader.hadoop( + "hadoop", + new Configuration(), + ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); + this.catalog = catalogLoader.loadCatalog(); + this.tableLoader = + TableLoader.fromCatalog(catalogLoader, TableIdentifier.of(database, tableName)); + } + + @Override + public void afterEach(ExtensionContext context) throws Exception { + try { + catalog.dropTable(TableIdentifier.of(database, tableName)); + ((HadoopCatalog) catalog).close(); + tableLoader.close(); + } catch (Exception e) { + throw new RuntimeException("Failed to close catalog resource"); + } + } + + public TableLoader tableLoader() { + return tableLoader; + } + + public Catalog catalog() { + return catalog; + } + + public CatalogLoader catalogLoader() { + return catalogLoader; + } + + public String warehouse() { + return warehouse; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java new file mode 100644 index 000000000000..dc6ef400a4a9 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
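A minimal sketch of wiring the extension above into a JUnit 5 test class; the database and table names, and the body of the test, are illustrative.

import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.flink.HadoopCatalogExtension;
import org.apache.iceberg.flink.TableLoader;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.RegisterExtension;

public class HadoopCatalogExtensionSketch {
  @RegisterExtension
  static final HadoopCatalogExtension CATALOG_EXTENSION = new HadoopCatalogExtension("db", "tbl");

  @Test
  public void testLoaders() {
    Catalog catalog = CATALOG_EXTENSION.catalog();              // fresh Hadoop catalog per test
    TableLoader tableLoader = CATALOG_EXTENSION.tableLoader();  // targets db.tbl in that catalog
    System.out.println(catalog.name() + " -> " + tableLoader);
  }
}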
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.junit.jupiter.api.extension.ExtensionContext; + +public class HadoopTableExtension extends HadoopCatalogExtension { + private final Schema schema; + private final PartitionSpec partitionSpec; + + private Table table; + + public HadoopTableExtension(String database, String tableName, Schema schema) { + this(database, tableName, schema, null); + } + + public HadoopTableExtension( + String database, String tableName, Schema schema, PartitionSpec partitionSpec) { + super(database, tableName); + this.schema = schema; + this.partitionSpec = partitionSpec; + } + + @Override + public void beforeEach(ExtensionContext context) throws Exception { + super.beforeEach(context); + if (partitionSpec == null) { + this.table = catalog.createTable(TableIdentifier.of(database, tableName), schema); + } else { + this.table = + catalog.createTable(TableIdentifier.of(database, tableName), schema, partitionSpec); + } + tableLoader.open(); + } + + public Table table() { + return table; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java new file mode 100644 index 000000000000..d2e086aa448e --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.runtime.testutils.InMemoryReporter; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.test.junit5.MiniClusterExtension; + +public class MiniFlinkClusterExtension { + + private static final int DEFAULT_TM_NUM = 1; + private static final int DEFAULT_PARALLELISM = 4; + + public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = + new Configuration() + // disable classloader check as Avro may cache class/object in the serializers. 
+          .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false);
+
+  private MiniFlinkClusterExtension() {}
+
+  /**
+   * Starts a mini cluster with classloader.check-leaked-classloader=false so that the unit tests
+   * are not broken by the class loader leak check. Some of our Iceberg integration tests assert
+   * results after the Flink jobs have finished, so they may access a class loader that the Flink
+   * task managers have already closed; keeping classloader.check-leaked-classloader at its default
+   * would make those assertions fail.
+   */
+  public static MiniClusterExtension createWithClassloaderCheckDisabled() {
+    return new MiniClusterExtension(
+        new MiniClusterResourceConfiguration.Builder()
+            .setNumberTaskManagers(DEFAULT_TM_NUM)
+            .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM)
+            .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG)
+            .build());
+  }
+
+  public static MiniClusterExtension createWithClassloaderCheckDisabled(
+      InMemoryReporter inMemoryReporter) {
+    Configuration configuration = new Configuration(DISABLE_CLASSLOADER_CHECK_CONFIG);
+    inMemoryReporter.addToConfiguration(configuration);
+
+    return new MiniClusterExtension(
+        new MiniClusterResourceConfiguration.Builder()
+            .setNumberTaskManagers(DEFAULT_TM_NUM)
+            .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM)
+            .setConfiguration(configuration)
+            .build());
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java
new file mode 100644
index 000000000000..a79406b75cf2
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.iceberg.flink; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.time.temporal.ChronoUnit; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +public class RowDataConverter { + private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); + private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); + + private RowDataConverter() {} + + public static RowData convert(Schema iSchema, Record record) { + return convert(iSchema.asStruct(), record); + } + + private static RowData convert(Types.StructType struct, Record record) { + GenericRowData rowData = new GenericRowData(struct.fields().size()); + List fields = struct.fields(); + for (int i = 0; i < fields.size(); i += 1) { + Types.NestedField field = fields.get(i); + + Type fieldType = field.type(); + rowData.setField(i, convert(fieldType, record.get(i))); + } + return rowData; + } + + private static Object convert(Type type, Object object) { + if (object == null) { + return null; + } + + switch (type.typeId()) { + case BOOLEAN: + case INTEGER: + case LONG: + case FLOAT: + case DOUBLE: + case FIXED: + return object; + case DATE: + return (int) ChronoUnit.DAYS.between(EPOCH_DAY, (LocalDate) object); + case TIME: + // Iceberg's time is in microseconds, while flink's time is in milliseconds. 
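+        // Illustrative example: LocalTime.of(10, 0, 0, 123_456_000) has
+        // nanoOfDay = 36_000_123_456_000, which converts to the int 36_000_123 (milliseconds)
+        // stored by Flink's TIME type; sub-millisecond precision is dropped by this conversion.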
+ LocalTime localTime = (LocalTime) object; + return (int) TimeUnit.NANOSECONDS.toMillis(localTime.toNanoOfDay()); + case TIMESTAMP: + return convertTimestamp(object, ((Types.TimestampType) type).shouldAdjustToUTC()); + case TIMESTAMP_NANO: + return convertTimestamp(object, ((Types.TimestampNanoType) type).shouldAdjustToUTC()); + case STRING: + return StringData.fromString((String) object); + case UUID: + UUID uuid = (UUID) object; + ByteBuffer bb = ByteBuffer.allocate(16); + bb.putLong(uuid.getMostSignificantBits()); + bb.putLong(uuid.getLeastSignificantBits()); + return bb.array(); + case BINARY: + ByteBuffer buffer = (ByteBuffer) object; + return Arrays.copyOfRange( + buffer.array(), + buffer.arrayOffset() + buffer.position(), + buffer.arrayOffset() + buffer.remaining()); + case DECIMAL: + Types.DecimalType decimalType = (Types.DecimalType) type; + return DecimalData.fromBigDecimal( + (BigDecimal) object, decimalType.precision(), decimalType.scale()); + case STRUCT: + return convert(type.asStructType(), (Record) object); + case LIST: + List list = (List) object; + Object[] convertedArray = new Object[list.size()]; + for (int i = 0; i < convertedArray.length; i++) { + convertedArray[i] = convert(type.asListType().elementType(), list.get(i)); + } + return new GenericArrayData(convertedArray); + case MAP: + Map convertedMap = Maps.newLinkedHashMap(); + Map map = (Map) object; + for (Map.Entry entry : map.entrySet()) { + convertedMap.put( + convert(type.asMapType().keyType(), entry.getKey()), + convert(type.asMapType().valueType(), entry.getValue())); + } + return new GenericMapData(convertedMap); + default: + throw new UnsupportedOperationException("Not a supported type: " + type); + } + } + + private static TimestampData convertTimestamp(Object timestamp, boolean shouldAdjustToUTC) { + if (shouldAdjustToUTC) { + return TimestampData.fromEpochMillis( + ((OffsetDateTime) timestamp).toInstant().toEpochMilli(), + ((OffsetDateTime) timestamp).getNano() % 1_000_000); + } else { + return TimestampData.fromEpochMillis( + ((LocalDateTime) timestamp).toInstant(ZoneOffset.UTC).toEpochMilli(), + ((LocalDateTime) timestamp).getNano() % 1_000_000); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java new file mode 100644 index 000000000000..d9c9f7ad3f02 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java @@ -0,0 +1,469 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableScan; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.InternalRecordWrapper; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.deletes.PositionDelete; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.flink.sink.FlinkAppenderFactory; +import org.apache.iceberg.hadoop.HadoopInputFile; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.Pair; +import org.apache.iceberg.util.StructLikeSet; +import org.apache.iceberg.util.StructLikeWrapper; +import org.awaitility.Awaitility; + +public class SimpleDataUtil { + + private SimpleDataUtil() {} + + public static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + public static final Schema SCHEMA2 = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "extra", Types.StringType.get())); + + public static final ResolvedSchema FLINK_SCHEMA = + ResolvedSchema.of( + Column.physical("id", DataTypes.INT()), Column.physical("data", DataTypes.STRING())); + + public static final TableSchema FLINK_TABLE_SCHEMA = TableSchema.fromResolvedSchema(FLINK_SCHEMA); + + public static final RowType ROW_TYPE = + (RowType) FLINK_SCHEMA.toSourceRowDataType().getLogicalType(); + + public 
static final Record RECORD = GenericRecord.create(SCHEMA); + public static final Record RECORD2 = GenericRecord.create(SCHEMA2); + + public static Table createTable( + String path, Map properties, boolean partitioned) { + PartitionSpec spec; + if (partitioned) { + spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); + } else { + spec = PartitionSpec.unpartitioned(); + } + return new HadoopTables().create(SCHEMA, spec, properties, path); + } + + public static Record createRecord(Integer id, String data) { + Record record = RECORD.copy(); + record.setField("id", id); + record.setField("data", data); + return record; + } + + public static Record createRecord(Integer id, String data, String extra) { + Record record = RECORD2.copy(); + record.setField("id", id); + record.setField("data", data); + record.setField("extra", extra); + return record; + } + + public static RowData createRowData(Integer id, String data) { + return GenericRowData.of(id, StringData.fromString(data)); + } + + public static RowData createInsert(Integer id, String data) { + return GenericRowData.ofKind(RowKind.INSERT, id, StringData.fromString(data)); + } + + public static RowData createDelete(Integer id, String data) { + return GenericRowData.ofKind(RowKind.DELETE, id, StringData.fromString(data)); + } + + public static RowData createUpdateBefore(Integer id, String data) { + return GenericRowData.ofKind(RowKind.UPDATE_BEFORE, id, StringData.fromString(data)); + } + + public static RowData createUpdateAfter(Integer id, String data) { + return GenericRowData.ofKind(RowKind.UPDATE_AFTER, id, StringData.fromString(data)); + } + + public static DataFile writeFile( + Table table, + Schema schema, + PartitionSpec spec, + Configuration conf, + String location, + String filename, + List rows) + throws IOException { + return writeFile(table, schema, spec, conf, location, filename, rows, null); + } + + /** Write the list of {@link RowData} to the given path and with the given partition data */ + public static DataFile writeFile( + Table table, + Schema schema, + PartitionSpec spec, + Configuration conf, + String location, + String filename, + List rows, + StructLike partition) + throws IOException { + Path path = new Path(location, filename); + FileFormat fileFormat = FileFormat.fromFileName(filename); + Preconditions.checkNotNull(fileFormat, "Cannot determine format for file: %s", filename); + + RowType flinkSchema = FlinkSchemaUtil.convert(schema); + FileAppenderFactory appenderFactory = + new FlinkAppenderFactory( + table, schema, flinkSchema, ImmutableMap.of(), spec, null, null, null); + + FileAppender appender = appenderFactory.newAppender(fromPath(path, conf), fileFormat); + try (FileAppender closeableAppender = appender) { + closeableAppender.addAll(rows); + } + + DataFiles.Builder builder = + DataFiles.builder(spec) + .withInputFile(HadoopInputFile.fromPath(path, conf)) + .withMetrics(appender.metrics()); + + if (partition != null) { + builder = builder.withPartition(partition); + } + + return builder.build(); + } + + public static DeleteFile writeEqDeleteFile( + Table table, + FileFormat format, + String filename, + FileAppenderFactory appenderFactory, + List deletes) + throws IOException { + EncryptedOutputFile outputFile = + table + .encryption() + .encrypt(fromPath(new Path(table.location(), filename), new Configuration())); + + EqualityDeleteWriter eqWriter = + appenderFactory.newEqDeleteWriter(outputFile, format, null); + try (EqualityDeleteWriter writer = eqWriter) { + writer.write(deletes); + } + return 
eqWriter.toDeleteFile(); + } + + public static DeleteFile writePosDeleteFile( + Table table, + FileFormat format, + String filename, + FileAppenderFactory appenderFactory, + List> positions) + throws IOException { + EncryptedOutputFile outputFile = + table + .encryption() + .encrypt(fromPath(new Path(table.location(), filename), new Configuration())); + + PositionDeleteWriter posWriter = + appenderFactory.newPosDeleteWriter(outputFile, format, null); + PositionDelete posDelete = PositionDelete.create(); + try (PositionDeleteWriter writer = posWriter) { + for (Pair p : positions) { + writer.write(posDelete.set(p.first(), p.second(), null)); + } + } + return posWriter.toDeleteFile(); + } + + private static List convertToRecords(List rows) { + List records = Lists.newArrayList(); + for (RowData row : rows) { + Integer id = row.isNullAt(0) ? null : row.getInt(0); + String data = row.isNullAt(1) ? null : row.getString(1).toString(); + if (row.getArity() == 2) { + records.add(createRecord(id, data)); + } else { + String extra = row.isNullAt(2) ? null : row.getString(2).toString(); + records.add(createRecord(id, data, extra)); + } + } + return records; + } + + public static void assertTableRows(String tablePath, List expected, String branch) + throws IOException { + assertTableRecords(tablePath, convertToRecords(expected), branch); + } + + public static void assertTableRows(Table table, List expected) throws IOException { + assertTableRecords(table, convertToRecords(expected), SnapshotRef.MAIN_BRANCH); + } + + public static void assertTableRows(Table table, List expected, String branch) + throws IOException { + assertTableRecords(table, convertToRecords(expected), branch); + } + + /** Get all rows for a table */ + public static List tableRecords(Table table) throws IOException { + table.refresh(); + List records = Lists.newArrayList(); + try (CloseableIterable iterable = IcebergGenerics.read(table).build()) { + for (Record record : iterable) { + records.add(record); + } + } + return records; + } + + public static boolean equalsRecords(List expected, List actual, Schema schema) { + if (expected.size() != actual.size()) { + return false; + } + Types.StructType type = schema.asStruct(); + StructLikeSet expectedSet = StructLikeSet.create(type); + expectedSet.addAll(expected); + StructLikeSet actualSet = StructLikeSet.create(type); + actualSet.addAll(actual); + return expectedSet.equals(actualSet); + } + + public static void assertRecordsEqual(List expected, List actual, Schema schema) { + assertThat(actual).hasSameSizeAs(expected); + Types.StructType type = schema.asStruct(); + StructLikeSet expectedSet = StructLikeSet.create(type); + expectedSet.addAll(expected); + StructLikeSet actualSet = StructLikeSet.create(type); + actualSet.addAll(actual); + assertThat(actualSet).containsExactlyInAnyOrderElementsOf(expectedSet); + } + + /** + * Assert table contains the expected list of records after waiting up to the configured {@code + * timeout} + */ + public static void assertTableRecords(Table table, List expected, Duration timeout) { + Awaitility.await("expected list of records should be produced") + .atMost(timeout) + .untilAsserted(() -> assertRecordsEqual(expected, tableRecords(table), table.schema())); + } + + public static void assertTableRecords(Table table, List expected) throws IOException { + assertTableRecords(table, expected, SnapshotRef.MAIN_BRANCH); + } + + public static void assertTableRecords(Table table, List expected, String branch) + throws IOException { + table.refresh(); + Snapshot 
snapshot = latestSnapshot(table, branch); + + if (snapshot == null) { + assertThat(expected) + .as( + "No snapshot for table '%s', assuming expected data is empty. If that's not the case, the Flink job most likely did not checkpoint.", + table.name()) + .isEmpty(); + return; + } + + Types.StructType type = table.schema().asStruct(); + StructLikeSet expectedSet = StructLikeSet.create(type); + expectedSet.addAll(expected); + + try (CloseableIterable iterable = + IcebergGenerics.read(table).useSnapshot(snapshot.snapshotId()).build()) { + StructLikeSet actualSet = StructLikeSet.create(type); + + for (Record record : iterable) { + actualSet.add(record); + } + + assertThat(actualSet).containsExactlyInAnyOrderElementsOf(expectedSet); + } + } + + // Returns the latest snapshot of the given branch in the table + public static Snapshot latestSnapshot(Table table, String branch) { + // For the main branch, currentSnapshot() is used to validate that the API behavior has + // not changed since that was the API used for validation prior to addition of branches. + if (branch.equals(SnapshotRef.MAIN_BRANCH)) { + return table.currentSnapshot(); + } + + return table.snapshot(branch); + } + + public static void assertTableRecords(String tablePath, List expected) + throws IOException { + Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); + assertTableRecords(new HadoopTables().load(tablePath), expected, SnapshotRef.MAIN_BRANCH); + } + + public static void assertTableRecords(String tablePath, List expected, String branch) + throws IOException { + Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); + assertTableRecords(new HadoopTables().load(tablePath), expected, branch); + } + + public static StructLikeSet expectedRowSet(Table table, Record... records) { + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + InternalRecordWrapper wrapper = new InternalRecordWrapper(table.schema().asStruct()); + for (Record record : records) { + set.add(wrapper.copyFor(record)); + } + return set; + } + + public static StructLikeSet actualRowSet(Table table, String... columns) throws IOException { + return actualRowSet(table, null, columns); + } + + public static StructLikeSet actualRowSet(Table table, Long snapshotId, String... columns) + throws IOException { + table.refresh(); + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + InternalRecordWrapper wrapper = new InternalRecordWrapper(table.schema().asStruct()); + try (CloseableIterable reader = + IcebergGenerics.read(table) + .useSnapshot(snapshotId == null ? 
table.currentSnapshot().snapshotId() : snapshotId) + .select(columns) + .build()) { + reader.forEach(record -> set.add(wrapper.copyFor(record))); + } + return set; + } + + public static List partitionDataFiles(Table table, Map partitionValues) + throws IOException { + table.refresh(); + Types.StructType partitionType = table.spec().partitionType(); + + Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); + StructLikeWrapper expectedWrapper = + StructLikeWrapper.forType(partitionType).set(partitionRecord); + + List dataFiles = Lists.newArrayList(); + try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { + for (FileScanTask scanTask : fileScanTasks) { + StructLikeWrapper wrapper = + StructLikeWrapper.forType(partitionType).set(scanTask.file().partition()); + + if (expectedWrapper.equals(wrapper)) { + dataFiles.add(scanTask.file()); + } + } + } + + return dataFiles; + } + + public static Map> snapshotToDataFiles(Table table) throws IOException { + table.refresh(); + + Map> result = Maps.newHashMap(); + Snapshot current = table.currentSnapshot(); + while (current != null) { + TableScan tableScan = table.newScan(); + if (current.parentId() != null) { + // Collect the data files that was added only in current snapshot. + tableScan = tableScan.appendsBetween(current.parentId(), current.snapshotId()); + } else { + // Collect the data files that was added in the oldest snapshot. + tableScan = tableScan.useSnapshot(current.snapshotId()); + } + try (CloseableIterable scanTasks = tableScan.planFiles()) { + result.put( + current.snapshotId(), + ImmutableList.copyOf(Iterables.transform(scanTasks, FileScanTask::file))); + } + + // Continue to traverse the parent snapshot if exists. + if (current.parentId() == null) { + break; + } + // Iterate to the parent snapshot. + current = table.snapshot(current.parentId()); + } + return result; + } + + public static List matchingPartitions( + List dataFiles, PartitionSpec partitionSpec, Map partitionValues) { + Types.StructType partitionType = partitionSpec.partitionType(); + Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); + StructLikeWrapper expected = StructLikeWrapper.forType(partitionType).set(partitionRecord); + return dataFiles.stream() + .filter( + df -> { + StructLikeWrapper wrapper = + StructLikeWrapper.forType(partitionType).set(df.partition()); + return wrapper.equals(expected); + }) + .collect(Collectors.toList()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java new file mode 100644 index 000000000000..9411ea4f7d71 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink;
+
+import static org.apache.iceberg.flink.FlinkCatalogFactory.DEFAULT_CATALOG_NAME;
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.List;
+import java.util.Map;
+import org.apache.flink.table.api.TableEnvironment;
+import org.apache.flink.table.api.TableResult;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.CloseableIterator;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+
+public abstract class SqlBase {
+  protected abstract TableEnvironment getTableEnv();
+
+  protected static TableResult exec(TableEnvironment env, String query, Object... args) {
+    return env.executeSql(String.format(query, args));
+  }
+
+  protected TableResult exec(String query, Object... args) {
+    return exec(getTableEnv(), query, args);
+  }
+
+  protected List sql(String query, Object... args) {
+    TableResult tableResult = exec(query, args);
+    try (CloseableIterator iter = tableResult.collect()) {
+      return Lists.newArrayList(iter);
+    } catch (Exception e) {
+      throw new RuntimeException("Failed to collect table result", e);
+    }
+  }
+
+  protected void assertSameElements(Iterable expected, Iterable actual) {
+    assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected);
+  }
+
+  protected void assertSameElements(String message, Iterable expected, Iterable actual) {
+    assertThat(actual).isNotNull().as(message).containsExactlyInAnyOrderElementsOf(expected);
+  }
+
+  /**
+   * We cannot drop the currently used catalog after FLINK-29677, so we have to make sure that we
+   * do not use the current catalog before dropping it. This method switches to the
+   * 'default_catalog' and drops the one requested.
+   *
+   * @param catalogName The catalog to drop
+   * @param ifExists If we should use the 'IF EXISTS' when dropping the catalog
+   */
+  protected void dropCatalog(String catalogName, boolean ifExists) {
+    sql("USE CATALOG %s", DEFAULT_CATALOG_NAME);
+    sql("DROP CATALOG %s %s", ifExists ? "IF EXISTS" : "", catalogName);
+  }
+
+  /**
+   * We cannot drop the currently used database after FLINK-33226, so we have to make sure that we
+   * do not use the current database before dropping it. This method switches to the default
+   * database in the default catalog, switches back to the current catalog, and then drops the
+   * requested database.
+   *
+   * @param database The database to drop
+   * @param ifExists If we should use the 'IF EXISTS' when dropping the database
+   */
+  protected void dropDatabase(String database, boolean ifExists) {
+    String currentCatalog = getTableEnv().getCurrentCatalog();
+    sql("USE CATALOG %s", DEFAULT_CATALOG_NAME);
+    sql("USE %s", getTableEnv().listDatabases()[0]);
+    sql("USE CATALOG %s", currentCatalog);
+    sql("DROP DATABASE %s %s", ifExists ?
"IF EXISTS" : "", database); + } + + protected static String toWithClause(Map props) { + StringBuilder builder = new StringBuilder(); + builder.append("("); + int propCount = 0; + for (Map.Entry entry : props.entrySet()) { + if (propCount > 0) { + builder.append(","); + } + builder + .append("'") + .append(entry.getKey()) + .append("'") + .append("=") + .append("'") + .append(entry.getValue()) + .append("'"); + propCount++; + } + builder.append(")"); + return builder.toString(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestBase.java new file mode 100644 index 000000000000..401960c3591b --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestBase.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.flink.FlinkCatalogFactory.DEFAULT_CATALOG_NAME; +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.util.List; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.CatalogUtil; +import org.apache.iceberg.hive.HiveCatalog; +import org.apache.iceberg.hive.TestHiveMetastore; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +public abstract class TestBase extends SqlBase { + + @RegisterExtension + public static MiniClusterExtension miniClusterExtension = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @TempDir protected Path temporaryDirectory; + + private static TestHiveMetastore metastore = null; + protected static HiveConf hiveConf = null; + protected static HiveCatalog catalog = null; + + private volatile TableEnvironment tEnv = null; + + @BeforeAll + public static void startMetastore() { + TestBase.metastore = new TestHiveMetastore(); + metastore.start(); + TestBase.hiveConf = metastore.hiveConf(); + TestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + } + + @AfterAll + public static void stopMetastore() throws Exception { + metastore.stop(); + TestBase.catalog = null; + } + + 
@Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + + TableEnvironment env = TableEnvironment.create(settings); + env.getConfig() + .getConfiguration() + .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); + tEnv = env; + } + } + } + return tEnv; + } + + protected static TableResult exec(TableEnvironment env, String query, Object... args) { + return env.executeSql(String.format(query, args)); + } + + protected TableResult exec(String query, Object... args) { + return exec(getTableEnv(), query, args); + } + + protected List sql(String query, Object... args) { + TableResult tableResult = exec(query, args); + try (CloseableIterator iter = tableResult.collect()) { + return Lists.newArrayList(iter); + } catch (Exception e) { + throw new RuntimeException("Failed to collect table result", e); + } + } + + protected void assertSameElements(Iterable expected, Iterable actual) { + assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected); + } + + protected void assertSameElements(String message, Iterable expected, Iterable actual) { + assertThat(actual).isNotNull().as(message).containsExactlyInAnyOrderElementsOf(expected); + } + + /** + * We can not drop currently used catalog after FLINK-29677, so we have make sure that we do not + * use the current catalog before dropping it. This method switches to the 'default_catalog' and + * drops the one requested. + * + * @param catalogName The catalog to drop + * @param ifExists If we should use the 'IF EXISTS' when dropping the catalog + */ + protected void dropCatalog(String catalogName, boolean ifExists) { + sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); + sql("DROP CATALOG %s %s", ifExists ? "IF EXISTS" : "", catalogName); + } + + /** + * We can not drop currently used database after FLINK-33226, so we have make sure that we do not + * use the current database before dropping it. This method switches to the default database in + * the default catalog, and then it and drops the one requested. + * + * @param database The database to drop + * @param ifExists If we should use the 'IF EXISTS' when dropping the database + */ + protected void dropDatabase(String database, boolean ifExists) { + String currentCatalog = getTableEnv().getCurrentCatalog(); + sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); + sql("USE %s", getTableEnv().listDatabases()[0]); + sql("USE CATALOG %s", currentCatalog); + sql("DROP DATABASE %s %s", ifExists ? "IF EXISTS" : "", database); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java new file mode 100644 index 000000000000..e8f65921c19a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.CatalogProperties.URI; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.entry; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.Map; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** Test for {@link CatalogLoader}. */ +public class TestCatalogLoader extends TestBase { + + private static File warehouse = null; + private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); + private static final Schema SCHEMA = + new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); + + @BeforeAll + public static void createWarehouse() throws IOException { + warehouse = File.createTempFile("warehouse", null); + assertThat(warehouse.delete()).isTrue(); + hiveConf.set("my_key", "my_value"); + } + + @AfterAll + public static void dropWarehouse() throws IOException { + if (warehouse != null && warehouse.exists()) { + Path warehousePath = new Path(warehouse.getAbsolutePath()); + FileSystem fs = warehousePath.getFileSystem(hiveConf); + assertThat(fs.delete(warehousePath, true)).as("Failed to delete " + warehousePath).isTrue(); + } + } + + @Test + public void testHadoopCatalogLoader() throws IOException, ClassNotFoundException { + Map properties = Maps.newHashMap(); + properties.put(CatalogProperties.WAREHOUSE_LOCATION, "file:" + warehouse); + CatalogLoader loader = CatalogLoader.hadoop("my_catalog", hiveConf, properties); + validateCatalogLoader(loader); + } + + @Test + public void testHiveCatalogLoader() throws IOException, ClassNotFoundException { + CatalogLoader loader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); + validateCatalogLoader(loader); + } + + @Test + public void testRESTCatalogLoader() { + Map properties = Maps.newHashMap(); + properties.put(URI, "http://localhost/"); + CatalogLoader.rest("my_catalog", hiveConf, Maps.newHashMap()); + } + + private static void validateCatalogLoader(CatalogLoader loader) + throws IOException, ClassNotFoundException { + Table table = javaSerAndDeSer(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); + validateHadoopConf(table); + } + + private static void validateHadoopConf(Table table) { + FileIO io = table.io(); + assertThat(io).as("FileIO should be a HadoopFileIO").isInstanceOf(HadoopFileIO.class); + HadoopFileIO hadoopIO = (HadoopFileIO) io; + 
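+    // "my_key" is set on hiveConf in createWarehouse(); finding it here shows that the Hadoop
+    // configuration survived the CatalogLoader's Java serialization round trip.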
assertThat(hadoopIO.conf()).contains(entry("my_key", "my_value")); + } + + @SuppressWarnings("unchecked") + private static T javaSerAndDeSer(T object) throws IOException, ClassNotFoundException { + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { + out.writeObject(object); + } + + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + return (T) in.readObject(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java new file mode 100644 index 000000000000..f719c7bc0001 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** Test for {@link TableLoader}. 
*/ +public class TestCatalogTableLoader extends TestBase { + + private static File warehouse = null; + private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); + private static final Schema SCHEMA = + new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); + + @BeforeAll + public static void createWarehouse() throws IOException { + warehouse = File.createTempFile("warehouse", null); + assertThat(warehouse.delete()).isTrue(); + hiveConf.set("my_key", "my_value"); + } + + @AfterAll + public static void dropWarehouse() throws IOException { + if (warehouse != null && warehouse.exists()) { + Path warehousePath = new Path(warehouse.getAbsolutePath()); + FileSystem fs = warehousePath.getFileSystem(hiveConf); + assertThat(fs.delete(warehousePath, true)).as("Failed to delete " + warehousePath).isTrue(); + } + } + + @Test + public void testHadoopTableLoader() throws IOException, ClassNotFoundException { + String location = "file:" + warehouse + "/my_table"; + new HadoopTables(hiveConf).create(SCHEMA, location); + validateTableLoader(TableLoader.fromHadoopTable(location, hiveConf)); + } + + @Test + public void testHiveCatalogTableLoader() throws IOException, ClassNotFoundException { + CatalogLoader loader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); + javaSerdes(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); + + CatalogLoader catalogLoader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); + validateTableLoader(TableLoader.fromCatalog(catalogLoader, IDENTIFIER)); + } + + private static void validateTableLoader(TableLoader loader) + throws IOException, ClassNotFoundException { + TableLoader copied = javaSerdes(loader); + copied.open(); + try { + validateHadoopConf(copied.loadTable()); + } finally { + copied.close(); + } + } + + private static void validateHadoopConf(Table table) { + FileIO io = table.io(); + assertThat(io).as("FileIO should be a HadoopFileIO").isInstanceOf(HadoopFileIO.class); + HadoopFileIO hadoopIO = (HadoopFileIO) io; + assertThat(hadoopIO.conf().get("my_key")).isEqualTo("my_value"); + } + + @SuppressWarnings("unchecked") + private static T javaSerdes(T object) throws IOException, ClassNotFoundException { + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { + out.writeObject(object); + } + + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + return (T) in.readObject(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java new file mode 100644 index 000000000000..1997ef6998a2 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import org.apache.flink.types.Row; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.BaseTable; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableOperations; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.source.BoundedTableFactory; +import org.apache.iceberg.flink.source.ChangeLogTableTestBase; +import org.apache.iceberg.relocated.com.google.common.base.Joiner; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.StructLikeSet; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +/** + * In this test case, we mainly cover the impact of primary key selection, multiple operations + * within a single transaction, and multiple operations between different txn on the correctness of + * the data. + */ +@ExtendWith(ParameterizedTestExtension.class) +public class TestChangeLogTable extends ChangeLogTableTestBase { + private static final Configuration CONF = new Configuration(); + private static final String SOURCE_TABLE = "default_catalog.default_database.source_change_logs"; + + private static final String CATALOG_NAME = "test_catalog"; + private static final String DATABASE_NAME = "test_db"; + private static final String TABLE_NAME = "test_table"; + private String warehouse; + + @Parameter private boolean partitioned; + + @Parameters(name = "PartitionedTable={0}") + public static Iterable parameters() { + return ImmutableList.of(new Object[] {true}, new Object[] {false}); + } + + @BeforeEach + public void before() throws IOException { + File warehouseFile = File.createTempFile("junit", null, temporaryDirectory.toFile()); + assertThat(warehouseFile.delete()).isTrue(); + warehouse = String.format("file:%s", warehouseFile); + + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_NAME, warehouse); + sql("USE CATALOG %s", CATALOG_NAME); + sql("CREATE DATABASE %s", DATABASE_NAME); + sql("USE %s", DATABASE_NAME); + // Set the table.exec.sink.upsert-materialize=NONE, so that downstream operators will receive + // the + // records with the same order as the source operator, bypassing Flink's inferred shuffle. 
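+    // Without this, the planner may insert a SinkUpsertMaterializer in front of the sink, which
+    // can change the order and shape of the changelog records these tests compare per checkpoint.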
+ getTableEnv().getConfig().set("table.exec.sink.upsert-materialize", "NONE"); + } + + @AfterEach + @Override + public void clean() { + sql("DROP TABLE IF EXISTS %s", TABLE_NAME); + dropDatabase(DATABASE_NAME, true); + dropCatalog(CATALOG_NAME, true); + BoundedTableFactory.clearDataSets(); + } + + @TestTemplate + public void testSqlChangeLogOnIdKey() throws Exception { + List> inputRowsPerCheckpoint = + ImmutableList.of( + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(1, "bbb"), + insertRow(2, "aaa"), + deleteRow(2, "aaa"), + insertRow(2, "bbb")), + ImmutableList.of( + updateBeforeRow(2, "bbb"), + updateAfterRow(2, "ccc"), + deleteRow(2, "ccc"), + insertRow(2, "ddd")), + ImmutableList.of( + deleteRow(1, "bbb"), + insertRow(1, "ccc"), + deleteRow(1, "ccc"), + insertRow(1, "ddd"))); + + List> expectedRecordsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "bbb")), + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "ddd")), + ImmutableList.of(insertRow(1, "ddd"), insertRow(2, "ddd"))); + + testSqlChangeLog( + TABLE_NAME, ImmutableList.of("id"), inputRowsPerCheckpoint, expectedRecordsPerCheckpoint); + } + + @TestTemplate + public void testChangeLogOnDataKey() throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(1, "bbb"), + insertRow(2, "aaa")), + ImmutableList.of( + updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), + ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "ccc"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa")), + ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(1, "ccc"), + insertRow(2, "aaa"), + insertRow(2, "ccc"))); + + testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); + } + + @TestTemplate + public void testChangeLogOnIdDataKey() throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(1, "bbb"), + insertRow(2, "aaa")), + ImmutableList.of( + updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), + ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(1, "ccc"), + insertRow(2, "aaa"), + insertRow(2, "bbb"))); + + testSqlChangeLog( + TABLE_NAME, ImmutableList.of("data", "id"), elementsPerCheckpoint, expectedRecords); + } + + @TestTemplate + public void testPureInsertOnIdKey() throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), + ImmutableList.of(insertRow(3, "ccc"), insertRow(4, "ddd")), + ImmutableList.of(insertRow(5, "eee"), insertRow(6, "fff"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), insertRow(2, "bbb"), insertRow(3, "ccc"), insertRow(4, "ddd")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(3, 
"ccc"), + insertRow(4, "ddd"), + insertRow(5, "eee"), + insertRow(6, "fff"))); + + testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); + } + + private static Record record(int id, String data) { + return SimpleDataUtil.createRecord(id, data); + } + + private Table createTable(String tableName, List key, boolean isPartitioned) { + String partitionByCause = isPartitioned ? "PARTITIONED BY (data)" : ""; + sql( + "CREATE TABLE %s(id INT, data VARCHAR, PRIMARY KEY(%s) NOT ENFORCED) %s", + tableName, Joiner.on(',').join(key), partitionByCause); + + // Upgrade the iceberg table to format v2. + CatalogLoader loader = + CatalogLoader.hadoop( + "my_catalog", CONF, ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); + Table table = loader.loadCatalog().loadTable(TableIdentifier.of(DATABASE_NAME, TABLE_NAME)); + TableOperations ops = ((BaseTable) table).operations(); + TableMetadata meta = ops.current(); + ops.commit(meta, meta.upgradeToFormatVersion(2)); + + return table; + } + + private void testSqlChangeLog( + String tableName, + List key, + List> inputRowsPerCheckpoint, + List> expectedRecordsPerCheckpoint) + throws Exception { + String dataId = BoundedTableFactory.registerDataSet(inputRowsPerCheckpoint); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + + assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)).isEqualTo(listJoin(inputRowsPerCheckpoint)); + + Table table = createTable(tableName, key, partitioned); + sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); + + table.refresh(); + List snapshots = findValidSnapshots(table); + int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); + assertThat(snapshots) + .as("Should have the expected snapshot number") + .hasSameSizeAs(expectedRecordsPerCheckpoint); + + for (int i = 0; i < expectedSnapshotNum; i++) { + long snapshotId = snapshots.get(i).snapshotId(); + List expectedRows = expectedRecordsPerCheckpoint.get(i); + assertThat(actualRowSet(table, snapshotId)) + .as("Should have the expected records for the checkpoint#" + i) + .isEqualTo(expectedRowSet(table, expectedRows)); + } + + if (expectedSnapshotNum > 0) { + assertThat(sql("SELECT * FROM %s", tableName)) + .as("Should have the expected rows in the final table") + .containsExactlyInAnyOrderElementsOf( + expectedRecordsPerCheckpoint.get(expectedSnapshotNum - 1)); + } + } + + private List findValidSnapshots(Table table) { + List validSnapshots = Lists.newArrayList(); + for (Snapshot snapshot : table.snapshots()) { + if (snapshot.allManifests(table.io()).stream() + .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { + validSnapshots.add(snapshot); + } + } + return validSnapshots; + } + + private static StructLikeSet expectedRowSet(Table table, List rows) { + Record[] records = new Record[rows.size()]; + for (int i = 0; i < records.length; i++) { + records[i] = record((int) rows.get(i).getField(0), (String) rows.get(i).getField(1)); + } + return SimpleDataUtil.expectedRowSet(table, records); + } + + private static StructLikeSet actualRowSet(Table table, long snapshotId) throws IOException { + return SimpleDataUtil.actualRowSet(table, snapshotId, "*"); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java new file mode 100644 index 000000000000..fa8c09f66651 --- /dev/null +++ 
b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Map; +import org.apache.flink.api.common.serialization.SerializerConfigImpl; +import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileMetadata; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestDataFileSerialization { + + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); + + private static final Map COLUMN_SIZES = Maps.newHashMap(); + private static final Map VALUE_COUNTS = Maps.newHashMap(); + private static final Map NULL_VALUE_COUNTS = Maps.newHashMap(); + private static final Map NAN_VALUE_COUNTS = Maps.newHashMap(); + private static final Map LOWER_BOUNDS = Maps.newHashMap(); + private static final Map UPPER_BOUNDS = Maps.newHashMap(); + + static { + COLUMN_SIZES.put(1, 2L); + COLUMN_SIZES.put(2, 3L); + VALUE_COUNTS.put(1, 5L); + VALUE_COUNTS.put(2, 3L); + VALUE_COUNTS.put(4, 2L); + NULL_VALUE_COUNTS.put(1, 0L); + NULL_VALUE_COUNTS.put(2, 2L); + NAN_VALUE_COUNTS.put(4, 1L); + LOWER_BOUNDS.put(1, longToBuffer(0L)); + UPPER_BOUNDS.put(1, longToBuffer(4L)); + } + + private static final Metrics METRICS = + new Metrics( + 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); + + private static final DataFile DATA_FILE = + 
DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(1234) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withSplitOffsets(ImmutableList.of(4L)) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) + .withSortOrder(SortOrder.unsorted()) + .build(); + + private static final DeleteFile POS_DELETE_FILE = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofPositionDeletes() + .withPath("/path/to/pos-delete.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) + .withRecordCount(23) + .build(); + + private static final DeleteFile EQ_DELETE_FILE = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofEqualityDeletes(2, 3) + .withPath("/path/to/equality-delete.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) + .withRecordCount(23) + .withSortOrder(SortOrder.unsorted()) + .build(); + + @Test + public void testJavaSerialization() throws Exception { + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { + out.writeObject(DATA_FILE); + out.writeObject(DATA_FILE.copy()); + + out.writeObject(POS_DELETE_FILE); + out.writeObject(POS_DELETE_FILE.copy()); + + out.writeObject(EQ_DELETE_FILE); + out.writeObject(EQ_DELETE_FILE.copy()); + } + + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + for (int i = 0; i < 2; i += 1) { + Object obj = in.readObject(); + assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); + TestHelpers.assertEquals(DATA_FILE, (DataFile) obj); + } + + for (int i = 0; i < 2; i += 1) { + Object obj = in.readObject(); + assertThat(obj).as("Should be a position DeleteFile").isInstanceOf(DeleteFile.class); + TestHelpers.assertEquals(POS_DELETE_FILE, (DeleteFile) obj); + } + + for (int i = 0; i < 2; i += 1) { + Object obj = in.readObject(); + assertThat(obj).as("Should be a equality DeleteFile").isInstanceOf(DeleteFile.class); + TestHelpers.assertEquals(EQ_DELETE_FILE, (DeleteFile) obj); + } + } + } + + @Test + public void testDataFileKryoSerialization() throws IOException { + KryoSerializer kryo = + new KryoSerializer<>(DataFile.class, new SerializerConfigImpl()); + + DataOutputSerializer outputView = new DataOutputSerializer(1024); + + kryo.serialize(DATA_FILE, outputView); + kryo.serialize(DATA_FILE.copy(), outputView); + + DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); + DataFile dataFile1 = kryo.deserialize(inputView); + DataFile dataFile2 = kryo.deserialize(inputView); + + TestHelpers.assertEquals(DATA_FILE, dataFile1); + TestHelpers.assertEquals(DATA_FILE, dataFile2); + } + + @Test + public void testDeleteFileKryoSerialization() throws IOException { + KryoSerializer kryo = + new KryoSerializer<>(DeleteFile.class, new SerializerConfigImpl()); + + DataOutputSerializer outputView = new DataOutputSerializer(1024); + + kryo.serialize(POS_DELETE_FILE, outputView); + kryo.serialize(POS_DELETE_FILE.copy(), outputView); + + kryo.serialize(EQ_DELETE_FILE, outputView); + kryo.serialize(EQ_DELETE_FILE.copy(), outputView); + + DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); + + DeleteFile posDeleteFile1 = kryo.deserialize(inputView); + DeleteFile posDeleteFile2 = 
kryo.deserialize(inputView); + + TestHelpers.assertEquals(POS_DELETE_FILE, posDeleteFile1); + TestHelpers.assertEquals(POS_DELETE_FILE, posDeleteFile2); + + DeleteFile eqDeleteFile1 = kryo.deserialize(inputView); + DeleteFile eqDeleteFile2 = kryo.deserialize(inputView); + + TestHelpers.assertEquals(EQ_DELETE_FILE, eqDeleteFile1); + TestHelpers.assertEquals(EQ_DELETE_FILE, eqDeleteFile2); + } + + private static ByteBuffer longToBuffer(long value) { + return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java new file mode 100644 index 000000000000..b9a7d5b1d589 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.types.Types.NestedField.required; + +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.types.Types; + +public class TestFixtures { + + private TestFixtures() {} + + public static final Schema SCHEMA = + new Schema( + required(1, "data", Types.StringType.get()), + required(2, "id", Types.LongType.get()), + required(3, "dt", Types.StringType.get())); + + public static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("dt").bucket("id", 1).build(); + + public static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); + + public static final String DATABASE = "default"; + public static final String TABLE = "t"; + public static final String SINK_TABLE = "t_sink"; + + public static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of(DATABASE, TABLE); + public static final TableIdentifier SINK_TABLE_IDENTIFIER = + TableIdentifier.of(DATABASE, SINK_TABLE); + + public static final Schema TS_SCHEMA = + new Schema( + required(1, "ts", Types.TimestampType.withoutZone()), + required(2, "str", Types.StringType.get())); + + public static final PartitionSpec TS_SPEC = + PartitionSpec.builderFor(TS_SCHEMA).hour("ts").build(); + + public static final RowType TS_ROW_TYPE = FlinkSchemaUtil.convert(TS_SCHEMA); +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java new file mode 100644 index 000000000000..70c8043f8fbb --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java @@ -0,0 +1,65 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.nio.file.Files; +import java.util.concurrent.TimeUnit; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.TableDescriptor; +import org.apache.flink.table.api.TableEnvironment; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; + +public class TestFlinkAnonymousTable extends TestBase { + + @Test + public void testWriteAnonymousTable() throws Exception { + File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); + TableEnvironment tEnv = getTableEnv(); + Table table = + tEnv.from( + TableDescriptor.forConnector("datagen") + .schema(Schema.newBuilder().column("f0", DataTypes.STRING()).build()) + .option("number-of-rows", "3") + .build()); + + TableDescriptor descriptor = + TableDescriptor.forConnector("iceberg") + .schema(Schema.newBuilder().column("f0", DataTypes.STRING()).build()) + .option("catalog-name", "hadoop_test") + .option("catalog-type", "hadoop") + .option("catalog-database", "test_db") + .option("catalog-table", "test") + .option("warehouse", warehouseDir.getAbsolutePath()) + .build(); + + table.insertInto(descriptor).execute(); + Awaitility.await() + .atMost(3, TimeUnit.SECONDS) + .untilAsserted( + () -> + assertThat(warehouseDir.toPath().resolve("test_db").resolve("test").toFile()) + .exists()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java new file mode 100644 index 000000000000..bd07087756ad --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; +import org.apache.flink.types.Row; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkCatalogDatabase extends CatalogTestBase { + + @AfterEach + @Override + public void clean() { + sql("DROP TABLE IF EXISTS %s.tl", flinkDatabase); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + @TestTemplate + public void testCreateNamespace() { + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Database should not already exist") + .isFalse(); + + sql("CREATE DATABASE %s", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Database should exist") + .isTrue(); + + sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Database should still exist") + .isTrue(); + + dropDatabase(flinkDatabase, true); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Database should be dropped") + .isFalse(); + + sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Database should be created") + .isTrue(); + } + + @TestTemplate + public void testDropEmptyDatabase() { + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + sql("CREATE DATABASE %s", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + dropDatabase(flinkDatabase, true); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should have been dropped") + .isFalse(); + } + + @TestTemplate + public void testDropNonEmptyNamespace() { + assumeThat(isHadoopCatalog) + .as("Hadoop catalog throws IOException: Directory is not empty.") + .isFalse(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + sql("CREATE DATABASE %s", flinkDatabase); + validationCatalog.createTable( + TableIdentifier.of(icebergNamespace, "tl"), + new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + assertThat(validationCatalog.tableExists(TableIdentifier.of(icebergNamespace, "tl"))) + .as("Table should exist") + .isTrue(); + assertThatThrownBy(() -> dropDatabase(flinkDatabase, true)) + .cause() + .isInstanceOf(DatabaseNotEmptyException.class) + .hasMessage( + String.format("Database %s in catalog %s is not empty.", DATABASE, catalogName)); + sql("DROP TABLE %s.tl", flinkDatabase); + } + + @TestTemplate + public void testListTables() { + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); 
+ sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + assertThat(sql("SHOW TABLES")).isEmpty(); + validationCatalog.createTable( + TableIdentifier.of(icebergNamespace, "tl"), + new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); + + List tables = sql("SHOW TABLES"); + assertThat(tables).hasSize(1); + assertThat("tl").as("Table name should match").isEqualTo(tables.get(0).getField(0)); + } + + @TestTemplate + public void testListNamespace() { + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + + List databases = sql("SHOW DATABASES"); + + if (isHadoopCatalog) { + assertThat(databases).hasSize(1); + assertThat(databases.get(0).getField(0)).as("Should have db database").isEqualTo("db"); + if (!baseNamespace.isEmpty()) { + // test namespace not belongs to this catalog + validationNamespaceCatalog.createNamespace( + Namespace.of(baseNamespace.level(0), "UNKNOWN_NAMESPACE")); + databases = sql("SHOW DATABASES"); + assertThat(databases).hasSize(1); + assertThat(databases.get(0).getField(0)).as("Should have db database").isEqualTo("db"); + } + } else { + // If there are multiple classes extends FlinkTestBase, TestHiveMetastore may loose the + // creation for default + // database. See HiveMetaStore.HMSHandler.init. + assertThat(databases) + .as("Should have db database") + .anyMatch(d -> Objects.equals(d.getField(0), "db")); + } + } + + @TestTemplate + public void testCreateNamespaceWithMetadata() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + assertThat(nsMetadata).containsEntry("prop", "value"); + } + + @TestTemplate + public void testCreateNamespaceWithComment() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + + sql("CREATE DATABASE %s COMMENT 'namespace doc'", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + assertThat(nsMetadata).containsEntry("comment", "namespace doc"); + } + + @TestTemplate + public void testCreateNamespaceWithLocation() throws Exception { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + + Path location = temporaryDirectory.getRoot(); + sql("CREATE DATABASE %s WITH ('location'='%s')", flinkDatabase, location); + 
assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + assertThat(nsMetadata).containsEntry("location", "file:" + location.getRoot()); + } + + @TestTemplate + public void testSetProperties() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + + sql("CREATE DATABASE %s", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + + Map defaultMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + assertThat(defaultMetadata).doesNotContainKey("prop"); + sql("ALTER DATABASE %s SET ('prop'='value')", flinkDatabase); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + assertThat(nsMetadata).containsEntry("prop", "value"); + } + + @TestTemplate + public void testHadoopNotSupportMeta() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isTrue(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + assertThatThrownBy(() -> sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase)) + .cause() + .isInstanceOf(UnsupportedOperationException.class) + .hasMessage( + String.format( + "Cannot create namespace %s: metadata is not supported", icebergNamespace)); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java new file mode 100644 index 000000000000..4c9e95b8fa82 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.hive.HiveCatalog; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestFlinkCatalogFactory { + + private Map props; + + @BeforeEach + public void before() { + props = Maps.newHashMap(); + props.put("type", "iceberg"); + props.put(CatalogProperties.WAREHOUSE_LOCATION, "/tmp/location"); + } + + @Test + public void testCreateCatalogHive() { + String catalogName = "hiveCatalog"; + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); + + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); + + assertThat(catalog).isNotNull().isInstanceOf(HiveCatalog.class); + } + + @Test + public void testCreateCatalogHadoop() { + String catalogName = "hadoopCatalog"; + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP); + + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); + + assertThat(catalog).isNotNull().isInstanceOf(HadoopCatalog.class); + } + + @Test + public void testCreateCatalogCustom() { + String catalogName = "customCatalog"; + props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); + + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); + + assertThat(catalog).isNotNull().isInstanceOf(CustomHadoopCatalog.class); + } + + @Test + public void testCreateCatalogCustomWithHiveCatalogTypeSet() { + String catalogName = "customCatalog"; + props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); + + assertThatThrownBy( + () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith( + "Cannot create catalog customCatalog, both catalog-type and catalog-impl are set"); + } + + @Test + public void testLoadCatalogUnknown() { + String catalogName = "unknownCatalog"; + props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "fooType"); + + assertThatThrownBy( + () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageStartingWith("Unknown catalog-type: fooType"); + } + + public static class CustomHadoopCatalog extends HadoopCatalog { + + public CustomHadoopCatalog() {} + + public CustomHadoopCatalog(Configuration conf, String warehouseLocation) { + setConf(conf); + initialize( + "custom", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java new file mode 100644 index 
000000000000..f7848a5d22ef --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java @@ -0,0 +1,722 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Schema.UnresolvedPrimaryKey; +import org.apache.flink.table.api.TableException; +import org.apache.flink.table.api.ValidationException; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.CommonCatalogOptions; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.iceberg.BaseTable; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DataOperations; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableOperations; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.NoSuchTableException; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkCatalogTable extends CatalogTestBase { + + @Override + @BeforeEach + public void before() { + super.before(); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + } + + @AfterEach + public void cleanNamespaces() { + sql("DROP TABLE IF EXISTS %s.tl", flinkDatabase); + sql("DROP TABLE IF EXISTS %s.tl2", flinkDatabase); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + @TestTemplate + public void testGetTable() { + sql("CREATE TABLE tl(id BIGINT, strV STRING)"); + + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, "tl")); + Schema iSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, 
"strV", Types.StringType.get())); + assertThat(table.schema().toString()) + .as("Should load the expected iceberg schema") + .isEqualTo(iSchema.toString()); + } + + @TestTemplate + public void testRenameTable() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support rename table").isFalse(); + final Schema tableSchema = + new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); + validationCatalog.createTable(TableIdentifier.of(icebergNamespace, "tl"), tableSchema); + sql("ALTER TABLE tl RENAME TO tl2"); + + assertThatThrownBy(() -> getTableEnv().from("tl")) + .isInstanceOf(ValidationException.class) + .hasMessage("Table `tl` was not found."); + + Schema actualSchema = FlinkSchemaUtil.convert(getTableEnv().from("tl2").getResolvedSchema()); + assertThat(tableSchema.asStruct()).isEqualTo(actualSchema.asStruct()); + } + + @TestTemplate + public void testCreateTable() throws TableNotExistException { + sql("CREATE TABLE tl(id BIGINT)"); + + Table table = table("tl"); + assertThat(table.schema().asStruct()) + .isEqualTo( + new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); + CatalogTable catalogTable = catalogTable("tl"); + assertThat(catalogTable.getUnresolvedSchema()) + .isEqualTo( + org.apache.flink.table.api.Schema.newBuilder() + .column("id", DataTypes.BIGINT()) + .build()); + } + + @TestTemplate + public void testCreateTableWithPrimaryKey() throws Exception { + sql("CREATE TABLE tl(id BIGINT, data STRING, key STRING PRIMARY KEY NOT ENFORCED)"); + + Table table = table("tl"); + assertThat(table.schema().identifierFieldIds()) + .as("Should have the expected row key.") + .isEqualTo(Sets.newHashSet(table.schema().findField("key").fieldId())); + CatalogTable catalogTable = catalogTable("tl"); + Optional uniqueConstraintOptional = + catalogTable.getUnresolvedSchema().getPrimaryKey(); + assertThat(uniqueConstraintOptional).isPresent(); + assertThat(uniqueConstraintOptional.get().getColumnNames()).containsExactly("key"); + } + + @TestTemplate + public void testCreateTableWithMultiColumnsInPrimaryKey() throws Exception { + sql( + "CREATE TABLE tl(id BIGINT, data STRING, CONSTRAINT pk_constraint PRIMARY KEY(data, id) NOT ENFORCED)"); + + Table table = table("tl"); + assertThat(table.schema().identifierFieldIds()) + .as("Should have the expected RowKey") + .isEqualTo( + Sets.newHashSet( + table.schema().findField("id").fieldId(), + table.schema().findField("data").fieldId())); + CatalogTable catalogTable = catalogTable("tl"); + Optional uniqueConstraintOptional = + catalogTable.getUnresolvedSchema().getPrimaryKey(); + assertThat(uniqueConstraintOptional).isPresent(); + assertThat(uniqueConstraintOptional.get().getColumnNames()).containsExactly("id", "data"); + } + + @TestTemplate + public void testCreateTableIfNotExists() { + sql("CREATE TABLE tl(id BIGINT)"); + + // Assert that table does exist. 
+ assertThat(table("tl")).isNotNull(); + + sql("DROP TABLE tl"); + assertThatThrownBy(() -> table("tl")) + .isInstanceOf(NoSuchTableException.class) + .hasMessage("Table does not exist: " + getFullQualifiedTableName("tl")); + + sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); + assertThat(table("tl").properties()).doesNotContainKey("key"); + + table("tl").updateProperties().set("key", "value").commit(); + assertThat(table("tl").properties()).containsEntry("key", "value"); + + sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); + assertThat(table("tl").properties()).containsEntry("key", "value"); + } + + @TestTemplate + public void testCreateTableLike() throws TableNotExistException { + sql("CREATE TABLE tl(id BIGINT)"); + sql("CREATE TABLE tl2 LIKE tl"); + + Table table = table("tl2"); + assertThat(table.schema().asStruct()) + .isEqualTo( + new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); + CatalogTable catalogTable = catalogTable("tl2"); + assertThat(catalogTable.getUnresolvedSchema()) + .isEqualTo( + org.apache.flink.table.api.Schema.newBuilder() + .column("id", DataTypes.BIGINT()) + .build()); + } + + @TestTemplate + public void testCreateTableLikeInDiffIcebergCatalog() throws TableNotExistException { + sql("CREATE TABLE tl(id BIGINT)"); + + String catalog2 = catalogName + "2"; + sql("CREATE CATALOG %s WITH %s", catalog2, toWithClause(config)); + sql("CREATE DATABASE %s", catalog2 + ".testdb"); + sql("CREATE TABLE %s LIKE tl", catalog2 + ".testdb.tl2"); + + CatalogTable catalogTable = catalogTable(catalog2, "testdb", "tl2"); + assertThat(catalogTable.getUnresolvedSchema()) + .isEqualTo( + org.apache.flink.table.api.Schema.newBuilder() + .column("id", DataTypes.BIGINT()) + .build()); + + dropCatalog(catalog2, true); + } + + @TestTemplate + public void testCreateTableLikeInFlinkCatalog() throws TableNotExistException { + sql("CREATE TABLE tl(id BIGINT)"); + + sql("CREATE TABLE `default_catalog`.`default_database`.tl2 LIKE tl"); + + CatalogTable catalogTable = catalogTable("default_catalog", "default_database", "tl2"); + assertThat(catalogTable.getUnresolvedSchema()) + .isEqualTo( + org.apache.flink.table.api.Schema.newBuilder() + .column("id", DataTypes.BIGINT()) + .build()); + + // `type` option is filtered out by Flink + // https://github.com/apache/flink/blob/edc3d68736de73665440f4313ddcfd9142d8d42b/flink-table/flink-table-common/src/main/java/org/apache/flink/table/factories/FactoryUtil.java#L378 + Map filteredOptions = Maps.newHashMap(config); + filteredOptions.remove(CommonCatalogOptions.CATALOG_TYPE.key()); + + String srcCatalogProps = + FlinkCreateTableOptions.toJson(catalogName, DATABASE, "tl", filteredOptions); + Map options = catalogTable.getOptions(); + assertThat(options) + .containsEntry( + FlinkCreateTableOptions.CONNECTOR_PROPS_KEY, + FlinkDynamicTableFactory.FACTORY_IDENTIFIER) + .containsEntry(FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY, srcCatalogProps); + } + + @TestTemplate + public void testCreateTableLocation() { + assumeThat(isHadoopCatalog) + .as("HadoopCatalog does not support creating table with location") + .isFalse(); + sql("CREATE TABLE tl(id BIGINT) WITH ('location'='file:///tmp/location')"); + + Table table = table("tl"); + assertThat(table.schema().asStruct()) + .isEqualTo( + new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); + assertThat(table.location()).isEqualTo("file:///tmp/location"); + } + + @TestTemplate + public void testCreatePartitionTable() throws TableNotExistException { + 
sql("CREATE TABLE tl(id BIGINT, dt STRING) PARTITIONED BY(dt)"); + + Table table = table("tl"); + assertThat(table.schema().asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + assertThat(table.spec()) + .isEqualTo(PartitionSpec.builderFor(table.schema()).identity("dt").build()); + CatalogTable catalogTable = catalogTable("tl"); + assertThat(catalogTable.getUnresolvedSchema()) + .isEqualTo( + org.apache.flink.table.api.Schema.newBuilder() + .column("id", DataTypes.BIGINT()) + .column("dt", DataTypes.STRING()) + .build()); + assertThat(catalogTable.getPartitionKeys()).isEqualTo(Collections.singletonList("dt")); + } + + @TestTemplate + public void testCreateTableWithColumnComment() { + sql("CREATE TABLE tl(id BIGINT COMMENT 'comment - id', data STRING COMMENT 'comment - data')"); + + Table table = table("tl"); + assertThat(table.schema().asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get(), "comment - id"), + Types.NestedField.optional(2, "data", Types.StringType.get(), "comment - data")) + .asStruct()); + } + + @TestTemplate + public void testCreateTableWithFormatV2ThroughTableProperty() throws Exception { + sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); + + Table table = table("tl"); + assertThat(((BaseTable) table).operations().current().formatVersion()).isEqualTo(2); + } + + @TestTemplate + public void testUpgradeTableWithFormatV2ThroughTableProperty() throws Exception { + sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='1')"); + + Table table = table("tl"); + TableOperations ops = ((BaseTable) table).operations(); + assertThat(ops.refresh().formatVersion()) + .as("should create table using format v1") + .isEqualTo(1); + sql("ALTER TABLE tl SET('format-version'='2')"); + assertThat(ops.refresh().formatVersion()) + .as("should update table to use format v2") + .isEqualTo(2); + } + + @TestTemplate + public void testDowngradeTableToFormatV1ThroughTablePropertyFails() throws Exception { + sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); + + Table table = table("tl"); + TableOperations ops = ((BaseTable) table).operations(); + assertThat(ops.refresh().formatVersion()) + .as("should create table using format v2") + .isEqualTo(2); + assertThatThrownBy(() -> sql("ALTER TABLE tl SET('format-version'='1')")) + .rootCause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot downgrade v2 table to v1"); + } + + @TestTemplate + public void testLoadTransformPartitionTable() throws TableNotExistException { + Schema schema = new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); + validationCatalog.createTable( + TableIdentifier.of(icebergNamespace, "tl"), + schema, + PartitionSpec.builderFor(schema).bucket("id", 100).build()); + + CatalogTable catalogTable = catalogTable("tl"); + assertThat(catalogTable.getUnresolvedSchema()) + .isEqualTo( + org.apache.flink.table.api.Schema.newBuilder() + .column("id", DataTypes.BIGINT()) + .build()); + assertThat(catalogTable.getPartitionKeys()).isEmpty(); + } + + @TestTemplate + public void testAlterTableProperties() throws TableNotExistException { + sql("CREATE TABLE tl(id BIGINT) WITH ('oldK'='oldV')"); + Map properties = Maps.newHashMap(); + properties.put("oldK", "oldV"); + + // new + sql("ALTER TABLE tl SET('newK'='newV')"); + properties.put("newK", "newV"); + 
assertThat(table("tl").properties()).containsAllEntriesOf(properties); + + // update old + sql("ALTER TABLE tl SET('oldK'='oldV2')"); + properties.put("oldK", "oldV2"); + assertThat(table("tl").properties()).containsAllEntriesOf(properties); + + // remove property + sql("ALTER TABLE tl RESET('oldK')"); + properties.remove("oldK"); + assertThat(table("tl").properties()).containsAllEntriesOf(properties); + } + + @TestTemplate + public void testAlterTableAddColumn() { + sql("CREATE TABLE tl(id BIGINT)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); + sql("ALTER TABLE tl ADD (dt STRING)"); + Schema schemaAfter1 = table("tl").schema(); + assertThat(schemaAfter1.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + // Add multiple columns + sql("ALTER TABLE tl ADD (col1 STRING COMMENT 'comment for col1', col2 BIGINT)"); + Schema schemaAfter2 = table("tl").schema(); + assertThat(schemaAfter2.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get()), + Types.NestedField.optional( + 3, "col1", Types.StringType.get(), "comment for col1"), + Types.NestedField.optional(4, "col2", Types.LongType.get())) + .asStruct()); + + // Adding an existing field should fail due to Flink's internal validation. + assertThatThrownBy(() -> sql("ALTER TABLE tl ADD (id STRING)")) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Try to add a column `id` which already exists in the table."); + } + + @TestTemplate + public void testAlterTableDropColumn() { + sql("CREATE TABLE tl(id BIGINT, dt STRING, col1 STRING, col2 BIGINT)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "col1", Types.StringType.get()), + Types.NestedField.optional(4, "col2", Types.LongType.get())) + .asStruct()); + sql("ALTER TABLE tl DROP (dt)"); + Schema schemaAfter1 = table("tl").schema(); + assertThat(schemaAfter1.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(3, "col1", Types.StringType.get()), + Types.NestedField.optional(4, "col2", Types.LongType.get())) + .asStruct()); + // Drop multiple columns + sql("ALTER TABLE tl DROP (col1, col2)"); + Schema schemaAfter2 = table("tl").schema(); + assertThat(schemaAfter2.asStruct()) + .isEqualTo( + new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); + // Dropping an non-existing field should fail due to Flink's internal validation. + assertThatThrownBy(() -> sql("ALTER TABLE tl DROP (foo)")) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("The column `foo` does not exist in the base table."); + + // Dropping an already-deleted field should fail due to Flink's internal validation. 
+ assertThatThrownBy(() -> sql("ALTER TABLE tl DROP (dt)")) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("The column `dt` does not exist in the base table."); + } + + @TestTemplate + public void testAlterTableModifyColumnName() { + sql("CREATE TABLE tl(id BIGINT, dt STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + sql("ALTER TABLE tl RENAME dt TO data"); + Schema schemaAfter = table("tl").schema(); + assertThat(schemaAfter.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())) + .asStruct()); + } + + @TestTemplate + public void testAlterTableModifyColumnType() { + sql("CREATE TABLE tl(id INTEGER, dt STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + // Promote type from Integer to Long + sql("ALTER TABLE tl MODIFY (id BIGINT)"); + Schema schemaAfter = table("tl").schema(); + assertThat(schemaAfter.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + // Type change that doesn't follow the type-promotion rule should fail due to Iceberg's + // validation. + assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (dt INTEGER)")) + .isInstanceOf(TableException.class) + .hasMessageContaining("Could not execute AlterTable") + .hasRootCauseInstanceOf(IllegalArgumentException.class) + .hasRootCauseMessage("Cannot change column type: dt: string -> int"); + } + + @TestTemplate + public void testAlterTableModifyColumnNullability() { + sql("CREATE TABLE tl(id INTEGER NOT NULL, dt STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + + // Set nullability from required to optional + sql("ALTER TABLE tl MODIFY (id INTEGER)"); + Schema schemaAfter = table("tl").schema(); + assertThat(schemaAfter.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + } + + @TestTemplate + public void testAlterTableModifyColumnPosition() { + sql("CREATE TABLE tl(id BIGINT, dt STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + + sql("ALTER TABLE tl MODIFY (dt STRING FIRST)"); + Schema schemaAfter = table("tl").schema(); + assertThat(schemaAfter.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(2, "dt", Types.StringType.get()), + Types.NestedField.optional(1, "id", Types.LongType.get())) + .asStruct()); + + sql("ALTER TABLE tl MODIFY (dt STRING AFTER id)"); + Schema schemaAfterAfter = table("tl").schema(); + assertThat(schemaAfterAfter.asStruct()) + .isEqualTo( + new Schema( + 
Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + // Modifying the position of a non-existing column should fail due to Flink's internal + // validation. + assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (non_existing STRING FIRST)")) + .isInstanceOf(ValidationException.class) + .hasMessageContaining( + "Try to modify a column `non_existing` which does not exist in the table."); + + // Moving a column after a non-existing column should fail due to Flink's internal validation. + assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (dt STRING AFTER non_existing)")) + .isInstanceOf(ValidationException.class) + .hasMessageContaining( + "Referenced column `non_existing` by 'AFTER' does not exist in the table."); + } + + @TestTemplate + public void testAlterTableModifyColumnComment() { + sql("CREATE TABLE tl(id BIGINT, dt STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + + sql("ALTER TABLE tl MODIFY (dt STRING COMMENT 'comment for dt field')"); + Schema schemaAfter = table("tl").schema(); + assertThat(schemaAfter.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional( + 2, "dt", Types.StringType.get(), "comment for dt field")) + .asStruct()); + } + + @TestTemplate + public void testAlterTableConstraint() { + sql("CREATE TABLE tl(id BIGINT NOT NULL, dt STRING NOT NULL, col1 STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.required(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "col1", Types.StringType.get())) + .asStruct()); + assertThat(schemaBefore.identifierFieldNames()).isEmpty(); + sql("ALTER TABLE tl ADD (PRIMARY KEY (id) NOT ENFORCED)"); + Schema schemaAfterAdd = table("tl").schema(); + assertThat(schemaAfterAdd.identifierFieldNames()).containsExactly("id"); + sql("ALTER TABLE tl MODIFY (PRIMARY KEY (dt) NOT ENFORCED)"); + Schema schemaAfterModify = table("tl").schema(); + assertThat(schemaAfterModify.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.required(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "col1", Types.StringType.get())) + .asStruct()); + assertThat(schemaAfterModify.identifierFieldNames()).containsExactly("dt"); + // Composite primary key + sql("ALTER TABLE tl MODIFY (PRIMARY KEY (id, dt) NOT ENFORCED)"); + Schema schemaAfterComposite = table("tl").schema(); + assertThat(schemaAfterComposite.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.required(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "col1", Types.StringType.get())) + .asStruct()); + assertThat(schemaAfterComposite.identifierFieldNames()).containsExactlyInAnyOrder("id", "dt"); + // Setting an optional field as primary key should fail + // because Iceberg's SchemaUpdate does not allow incompatible changes. 
+ assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (PRIMARY KEY (col1) NOT ENFORCED)")) + .isInstanceOf(TableException.class) + .hasMessageContaining("Could not execute AlterTable") + .hasRootCauseInstanceOf(IllegalArgumentException.class) + .hasRootCauseMessage("Cannot add field col1 as an identifier field: not a required field"); + + // Setting a composite key containing an optional field should fail + // because Iceberg's SchemaUpdate does not allow incompatible changes. + assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (PRIMARY KEY (id, col1) NOT ENFORCED)")) + .isInstanceOf(TableException.class) + .hasMessageContaining("Could not execute AlterTable") + .hasRootCauseInstanceOf(IllegalArgumentException.class) + .hasRootCauseMessage("Cannot add field col1 as an identifier field: not a required field"); + + // Dropping constraints is not supported yet + assertThatThrownBy(() -> sql("ALTER TABLE tl DROP PRIMARY KEY")) + .isInstanceOf(TableException.class) + .hasMessageContaining("Could not execute AlterTable") + .hasRootCauseInstanceOf(UnsupportedOperationException.class) + .hasRootCauseMessage("Unsupported table change: DropConstraint."); + } + + @TestTemplate + public void testRelocateTable() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support relocate table").isFalse(); + sql("CREATE TABLE tl(id BIGINT)"); + sql("ALTER TABLE tl SET('location'='file:///tmp/location')"); + assertThat(table("tl").location()).isEqualTo("file:///tmp/location"); + } + + @TestTemplate + public void testSetCurrentAndCherryPickSnapshotId() { + sql("CREATE TABLE tl(c1 INT, c2 STRING, c3 STRING) PARTITIONED BY (c1)"); + + Table table = table("tl"); + + DataFile fileA = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + DataFile fileB = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + DataFile replacementFile = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a-replacement.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(fileA).commit(); + long snapshotId = table.currentSnapshot().snapshotId(); + + // stage an overwrite that replaces FILE_A + table.newReplacePartitions().addFile(replacementFile).stageOnly().commit(); + + Snapshot staged = Iterables.getLast(table.snapshots()); + assertThat(staged.operation()) + .as("Should find the staged overwrite snapshot") + .isEqualTo(DataOperations.OVERWRITE); + // add another append so that the original commit can't be fast-forwarded + table.newAppend().appendFile(fileB).commit(); + + // test cherry pick + sql("ALTER TABLE tl SET('cherry-pick-snapshot-id'='%s')", staged.snapshotId()); + validateTableFiles(table, fileB, replacementFile); + + // test set current snapshot + sql("ALTER TABLE tl SET('current-snapshot-id'='%s')", snapshotId); + validateTableFiles(table, fileA); + } + + private void validateTableFiles(Table tbl, DataFile... 
expectedFiles) { + tbl.refresh(); + Set expectedFilePaths = + Arrays.stream(expectedFiles).map(DataFile::location).collect(Collectors.toSet()); + Set actualFilePaths = + StreamSupport.stream(tbl.newScan().planFiles().spliterator(), false) + .map(FileScanTask::file) + .map(ContentFile::location) + .collect(Collectors.toSet()); + assertThat(actualFilePaths).as("Files should match").isEqualTo(expectedFilePaths); + } + + private Table table(String name) { + return validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, name)); + } + + private CatalogTable catalogTable(String name) throws TableNotExistException { + return catalogTable(getTableEnv().getCurrentCatalog(), DATABASE, name); + } + + private CatalogTable catalogTable(String catalog, String database, String table) + throws TableNotExistException { + return (CatalogTable) + getTableEnv().getCatalog(catalog).get().getTable(new ObjectPath(database, table)); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java new file mode 100644 index 000000000000..e69e1ac4d713 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.flink;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.util.List;
+import org.apache.flink.table.catalog.CatalogPartitionSpec;
+import org.apache.flink.table.catalog.ObjectPath;
+import org.apache.flink.table.catalog.exceptions.TableNotExistException;
+import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException;
+import org.apache.iceberg.CatalogProperties;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.Parameter;
+import org.apache.iceberg.Parameters;
+import org.apache.iceberg.catalog.Namespace;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.TestTemplate;
+
+public class TestFlinkCatalogTablePartitions extends CatalogTestBase {
+
+  private final String tableName = "test_table";
+
+  @Parameter(index = 2)
+  private FileFormat format;
+
+  @Parameter(index = 3)
+  private Boolean cacheEnabled;
+
+  @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}")
+  protected static List<Object[]> parameters() {
+    List<Object[]> parameters = Lists.newArrayList();
+    for (FileFormat format :
+        new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) {
+      for (Boolean cacheEnabled : new Boolean[] {true, false}) {
+        for (Object[] catalogParams : CatalogTestBase.parameters()) {
+          String catalogName = (String) catalogParams[0];
+          Namespace baseNamespace = (Namespace) catalogParams[1];
+          parameters.add(new Object[] {catalogName, baseNamespace, format, cacheEnabled});
+        }
+      }
+    }
+    return parameters;
+  }
+
+  @Override
+  @BeforeEach
+  public void before() {
+    super.before();
+    config.put(CatalogProperties.CACHE_ENABLED, String.valueOf(cacheEnabled));
+    sql("CREATE DATABASE %s", flinkDatabase);
+    sql("USE CATALOG %s", catalogName);
+    sql("USE %s", DATABASE);
+  }
+
+  @AfterEach
+  public void cleanNamespaces() {
+    sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName);
+    dropDatabase(flinkDatabase, true);
+    super.clean();
+  }
+
+  @TestTemplate
+  public void testListPartitionsWithUnpartitionedTable() {
+    sql(
+        "CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')",
+        tableName, format.name());
+    sql("INSERT INTO %s SELECT 1,'a'", tableName);
+
+    ObjectPath objectPath = new ObjectPath(DATABASE, tableName);
+    FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get();
+    assertThatThrownBy(() -> flinkCatalog.listPartitions(objectPath))
+        .isInstanceOf(TableNotPartitionedException.class)
+        .hasMessageStartingWith("Table db.test_table in catalog")
+        .hasMessageEndingWith("is not partitioned.");
+  }
+
+  @TestTemplate
+  public void testListPartitionsWithPartitionedTable()
+      throws TableNotExistException, TableNotPartitionedException {
+    sql(
+        "CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) "
+            + "with ('write.format.default'='%s')",
+        tableName, format.name());
+    sql("INSERT INTO %s SELECT 1,'a'", tableName);
+    sql("INSERT INTO %s SELECT 2,'b'", tableName);
+
+    ObjectPath objectPath = new ObjectPath(DATABASE, tableName);
+    FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get();
+    List<CatalogPartitionSpec> list = flinkCatalog.listPartitions(objectPath);
+    assertThat(list).hasSize(2);
+    List<CatalogPartitionSpec> expected = Lists.newArrayList();
+    CatalogPartitionSpec partitionSpec1 = new CatalogPartitionSpec(ImmutableMap.of("data", "a"));
+    CatalogPartitionSpec partitionSpec2 = new CatalogPartitionSpec(ImmutableMap.of("data", "b"));
+    expected.add(partitionSpec1);
+    expected.add(partitionSpec2);
+    assertThat(list).as("Should produce the expected catalog partition specs.").isEqualTo(expected);
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java
new file mode 100644
index 000000000000..4b6ac25ab8e3
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.time.Duration;
+import java.util.Map;
+import org.apache.flink.configuration.ConfigOption;
+import org.apache.flink.configuration.ConfigOptions;
+import org.apache.flink.configuration.Configuration;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.junit.jupiter.api.Test;
+
+public class TestFlinkConfParser {
+
+  @Test
+  public void testDurationConf() {
+    Map<String, String> writeOptions = ImmutableMap.of("write-prop", "111s");
+
+    ConfigOption<Duration> configOption =
+        ConfigOptions.key("conf-prop").durationType().noDefaultValue();
+    Configuration flinkConf = new Configuration();
+    flinkConf.setString(configOption.key(), "222s");
+
+    Table table = mock(Table.class);
+    when(table.properties()).thenReturn(ImmutableMap.of("table-prop", "333s"));
+
+    FlinkConfParser confParser = new FlinkConfParser(table, writeOptions, flinkConf);
+    Duration defaultVal = Duration.ofMillis(999);
+
+    Duration result =
+        confParser.durationConf().option("write-prop").defaultValue(defaultVal).parse();
+    assertThat(result).isEqualTo(Duration.ofSeconds(111));
+
+    result = confParser.durationConf().flinkConfig(configOption).defaultValue(defaultVal).parse();
+    assertThat(result).isEqualTo(Duration.ofSeconds(222));
+
+    result = confParser.durationConf().tableProperty("table-prop").defaultValue(defaultVal).parse();
+    assertThat(result).isEqualTo(Duration.ofSeconds(333));
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java
new file mode 100644
index 000000000000..59b868ea1ef1
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java
@@ -0,0 +1,467 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.time.Instant;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.LocalTime;
+import java.util.List;
+import java.util.Optional;
+import java.util.stream.Collectors;
+import org.apache.flink.table.api.DataTypes;
+import org.apache.flink.table.api.Expressions;
+import org.apache.flink.table.catalog.Column;
+import org.apache.flink.table.catalog.ResolvedSchema;
+import org.apache.flink.table.expressions.ApiExpressionUtils;
+import org.apache.flink.table.expressions.CallExpression;
+import org.apache.flink.table.expressions.Expression;
+import org.apache.flink.table.expressions.FieldReferenceExpression;
+import org.apache.flink.table.expressions.ResolvedExpression;
+import org.apache.flink.table.expressions.UnresolvedCallExpression;
+import org.apache.flink.table.expressions.UnresolvedReferenceExpression;
+import org.apache.flink.table.expressions.ValueLiteralExpression;
+import org.apache.flink.table.expressions.utils.ApiExpressionDefaultVisitor;
+import org.apache.flink.table.functions.BuiltInFunctionDefinitions;
+import org.apache.iceberg.expressions.And;
+import org.apache.iceberg.expressions.BoundLiteralPredicate;
+import org.apache.iceberg.expressions.Not;
+import org.apache.iceberg.expressions.Or;
+import org.apache.iceberg.expressions.UnboundPredicate;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.util.DateTimeUtil;
+import org.apache.iceberg.util.Pair;
+import org.junit.jupiter.api.Test;
+
+public class TestFlinkFilters {
+
+  private static final ResolvedSchema RESOLVED_SCHEMA =
+      ResolvedSchema.of(
+          Column.physical("field1", DataTypes.INT()),
+          Column.physical("field2", DataTypes.BIGINT()),
+          Column.physical("field3", DataTypes.FLOAT()),
+          Column.physical("field4", DataTypes.DOUBLE()),
+          Column.physical("field5", DataTypes.STRING()),
+          Column.physical("field6", DataTypes.BOOLEAN()),
+          Column.physical("field7", DataTypes.BINARY(2)),
+          Column.physical("field8", DataTypes.DECIMAL(10, 2)),
+          Column.physical("field9", DataTypes.DATE()),
+          Column.physical("field10", DataTypes.TIME()),
+          Column.physical("field11", DataTypes.TIMESTAMP()),
+          Column.physical("field12", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()));
+
+  // A map list of fields and values used to verify the conversion of flink expression to iceberg
+  // expression
+  private static final List<Pair<String, Object>> FIELD_VALUE_LIST =
+      ImmutableList.of(
+          Pair.of("field1", 1),
+          Pair.of("field2", 2L),
+          Pair.of("field3", 3F),
+          Pair.of("field4", 4D),
+          Pair.of("field5", "iceberg"),
+          Pair.of("field6", true),
+          Pair.of("field7", new byte[] {'a', 'b'}),
+          Pair.of("field8",
BigDecimal.valueOf(10.12)), + Pair.of("field9", DateTimeUtil.daysFromDate(LocalDate.now())), + Pair.of("field10", DateTimeUtil.microsFromTime(LocalTime.now())), + Pair.of("field11", DateTimeUtil.microsFromTimestamp(LocalDateTime.now())), + Pair.of("field12", DateTimeUtil.microsFromInstant(Instant.now()))); + + @Test + public void testFlinkDataTypeEqual() { + matchLiteral("field1", 1, 1); + matchLiteral("field2", 10L, 10L); + matchLiteral("field3", 1.2F, 1.2F); + matchLiteral("field4", 3.4D, 3.4D); + matchLiteral("field5", "abcd", "abcd"); + matchLiteral("field6", true, true); + matchLiteral("field7", new byte[] {'a', 'b'}, ByteBuffer.wrap(new byte[] {'a', 'b'})); + matchLiteral("field8", BigDecimal.valueOf(10.12), BigDecimal.valueOf(10.12)); + + LocalDate date = LocalDate.parse("2020-12-23"); + matchLiteral("field9", date, DateTimeUtil.daysFromDate(date)); + + LocalTime time = LocalTime.parse("12:13:14"); + matchLiteral("field10", time, DateTimeUtil.microsFromTime(time)); + + LocalDateTime dateTime = LocalDateTime.parse("2020-12-23T12:13:14"); + matchLiteral("field11", dateTime, DateTimeUtil.microsFromTimestamp(dateTime)); + + Instant instant = Instant.parse("2020-12-23T12:13:14.00Z"); + matchLiteral("field12", instant, DateTimeUtil.microsFromInstant(instant)); + } + + @Test + public void testEquals() { + for (Pair pair : FIELD_VALUE_LIST) { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.equal(pair.first(), pair.second()); + + Optional actual = + FlinkFilters.convert( + resolve(Expressions.$(pair.first()).isEqual(Expressions.lit(pair.second())))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert( + resolve(Expressions.lit(pair.second()).isEqual(Expressions.$(pair.first())))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + } + + @Test + public void testEqualsNaN() { + UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.isNaN("field3"); + + Optional actual = + FlinkFilters.convert(resolve(Expressions.$("field3").isEqual(Expressions.lit(Float.NaN)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert(resolve(Expressions.lit(Float.NaN).isEqual(Expressions.$("field3")))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void testNotEquals() { + for (Pair pair : FIELD_VALUE_LIST) { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.notEqual(pair.first(), pair.second()); + + Optional actual = + FlinkFilters.convert( + resolve(Expressions.$(pair.first()).isNotEqual(Expressions.lit(pair.second())))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert( + resolve(Expressions.lit(pair.second()).isNotEqual(Expressions.$(pair.first())))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + } + + @Test + public void testNotEqualsNaN() { + UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notNaN("field3"); + + Optional actual = + FlinkFilters.convert( + resolve(Expressions.$("field3").isNotEqual(Expressions.lit(Float.NaN)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert( + resolve(Expressions.lit(Float.NaN).isNotEqual(Expressions.$("field3")))); + 
assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void testGreaterThan() { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.greaterThan("field1", 1); + + Optional actual = + FlinkFilters.convert(resolve(Expressions.$("field1").isGreater(Expressions.lit(1)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert(resolve(Expressions.lit(1).isLess(Expressions.$("field1")))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void testGreaterThanEquals() { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.greaterThanOrEqual("field1", 1); + + Optional actual = + FlinkFilters.convert(resolve(Expressions.$("field1").isGreaterOrEqual(Expressions.lit(1)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert(resolve(Expressions.lit(1).isLessOrEqual(Expressions.$("field1")))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void testLessThan() { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.lessThan("field1", 1); + + Optional actual = + FlinkFilters.convert(resolve(Expressions.$("field1").isLess(Expressions.lit(1)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert(resolve(Expressions.lit(1).isGreater(Expressions.$("field1")))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void testLessThanEquals() { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.lessThanOrEqual("field1", 1); + + Optional actual = + FlinkFilters.convert(resolve(Expressions.$("field1").isLessOrEqual(Expressions.lit(1)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert(resolve(Expressions.lit(1).isGreaterOrEqual(Expressions.$("field1")))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void testIsNull() { + Expression expr = resolve(Expressions.$("field1").isNull()); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.isNull("field1"); + assertPredicatesMatch(expected, actual.get()); + } + + @Test + public void testIsNotNull() { + Expression expr = resolve(Expressions.$("field1").isNotNull()); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.notNull("field1"); + assertPredicatesMatch(expected, actual.get()); + } + + @Test + public void testAnd() { + Expression expr = + resolve( + Expressions.$("field1") + .isEqual(Expressions.lit(1)) + .and(Expressions.$("field2").isEqual(Expressions.lit(2L)))); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + And and = (And) actual.get(); + And expected = + (And) + org.apache.iceberg.expressions.Expressions.and( + org.apache.iceberg.expressions.Expressions.equal("field1", 1), + org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); + + assertPredicatesMatch(expected.left(), and.left()); + assertPredicatesMatch(expected.right(), and.right()); 
+ } + + @Test + public void testOr() { + Expression expr = + resolve( + Expressions.$("field1") + .isEqual(Expressions.lit(1)) + .or(Expressions.$("field2").isEqual(Expressions.lit(2L)))); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + Or or = (Or) actual.get(); + Or expected = + (Or) + org.apache.iceberg.expressions.Expressions.or( + org.apache.iceberg.expressions.Expressions.equal("field1", 1), + org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); + + assertPredicatesMatch(expected.left(), or.left()); + assertPredicatesMatch(expected.right(), or.right()); + } + + @Test + public void testNot() { + Expression expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.NOT, + Expressions.$("field1").isEqual(Expressions.lit(1)))); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + Not not = (Not) actual.get(); + Not expected = + (Not) + org.apache.iceberg.expressions.Expressions.not( + org.apache.iceberg.expressions.Expressions.equal("field1", 1)); + + assertThat(not.op()).as("Predicate operation should match").isEqualTo(expected.op()); + assertPredicatesMatch(expected.child(), not.child()); + } + + @Test + public void testLike() { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.startsWith("field5", "abc"); + Expression expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%"))); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, + Expressions.$("field5"), + Expressions.lit("%abc%"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, + Expressions.$("field5"), + Expressions.lit("abc%d"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a_"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a%b"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + } + + @SuppressWarnings("unchecked") + private void matchLiteral(String fieldName, Object flinkLiteral, T icebergLiteral) { + Expression expr = resolve(Expressions.$(fieldName).isEqual(Expressions.lit(flinkLiteral))); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + org.apache.iceberg.expressions.Expression expression = actual.get(); + assertThat(expression) + .as("The expression should be a UnboundPredicate") + .isInstanceOf(UnboundPredicate.class); + UnboundPredicate unboundPredicate = (UnboundPredicate) expression; + + 
org.apache.iceberg.expressions.Expression expression1 = + unboundPredicate.bind(FlinkSchemaUtil.convert(RESOLVED_SCHEMA).asStruct(), false); + assertThat(expression1) + .as("The expression should be a BoundLiteralPredicate") + .isInstanceOf(BoundLiteralPredicate.class); + + BoundLiteralPredicate predicate = (BoundLiteralPredicate) expression1; + assertThat(predicate.test(icebergLiteral)).isTrue(); + } + + private static Expression resolve(Expression originalExpression) { + return originalExpression.accept( + new ApiExpressionDefaultVisitor<>() { + @Override + public Expression visit(UnresolvedReferenceExpression unresolvedReference) { + String name = unresolvedReference.getName(); + return RESOLVED_SCHEMA + .getColumn(name) + .map( + column -> { + int columnIndex = RESOLVED_SCHEMA.getColumns().indexOf(column); + return new FieldReferenceExpression( + name, column.getDataType(), 0, columnIndex); + }) + .orElse(null); + } + + @Override + public Expression visit(UnresolvedCallExpression unresolvedCall) { + List children = + unresolvedCall.getChildren().stream() + .map(e -> (ResolvedExpression) e.accept(this)) + .collect(Collectors.toList()); + return new CallExpression( + false, + unresolvedCall.getFunctionIdentifier().orElse(null), + unresolvedCall.getFunctionDefinition(), + children, + DataTypes.STRING()); + } + + @Override + public Expression visit(ValueLiteralExpression valueLiteral) { + return valueLiteral; + } + + @Override + protected Expression defaultMethod(Expression expression) { + throw new UnsupportedOperationException( + String.format("unsupported expression: %s", expression)); + } + }); + } + + private void assertPredicatesMatch( + org.apache.iceberg.expressions.Expression expected, + org.apache.iceberg.expressions.Expression actual) { + assertThat(expected) + .as("The expected expression should be a UnboundPredicate") + .isInstanceOf(UnboundPredicate.class); + assertThat(actual) + .as("The actual expression should be a UnboundPredicate") + .isInstanceOf(UnboundPredicate.class); + UnboundPredicate predicateExpected = (UnboundPredicate) expected; + UnboundPredicate predicateActual = (UnboundPredicate) actual; + assertThat(predicateActual.op()).isEqualTo(predicateExpected.op()); + assertThat(predicateActual.literal()).isEqualTo(predicateExpected.literal()); + assertThat(predicateActual.ref().name()).isEqualTo(predicateExpected.ref().name()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java new file mode 100644 index 000000000000..91343ab1ee72 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; + +public class TestFlinkHiveCatalog extends TestBase { + + @Test + public void testCreateCatalogWithWarehouseLocation() throws IOException { + Map props = Maps.newHashMap(); + props.put("type", "iceberg"); + props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); + props.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); + + File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); + props.put(CatalogProperties.WAREHOUSE_LOCATION, "file://" + warehouseDir.getAbsolutePath()); + + checkSQLQuery(props, warehouseDir); + } + + @Test + public void testCreateCatalogWithHiveConfDir() throws IOException { + // Dump the hive conf into a local file. + File hiveConfDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); + File hiveSiteXML = new File(hiveConfDir, "hive-site.xml"); + File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); + try (FileOutputStream fos = new FileOutputStream(hiveSiteXML)) { + Configuration newConf = new Configuration(hiveConf); + // Set another new directory which is different with the hive metastore's warehouse path. + newConf.set( + HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file://" + warehouseDir.getAbsolutePath()); + newConf.writeXml(fos); + } + assertThat(hiveSiteXML.toPath()).exists(); + + // Construct the catalog attributions. 
+ Map props = Maps.newHashMap(); + props.put("type", "iceberg"); + props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); + props.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); + // Set the 'hive-conf-dir' instead of 'warehouse' + props.put(FlinkCatalogFactory.HIVE_CONF_DIR, hiveConfDir.getAbsolutePath()); + + checkSQLQuery(props, warehouseDir); + } + + private void checkSQLQuery(Map catalogProperties, File warehouseDir) + throws IOException { + sql("CREATE CATALOG test_catalog WITH %s", CatalogTestBase.toWithClause(catalogProperties)); + sql("USE CATALOG test_catalog"); + sql("CREATE DATABASE test_db"); + sql("USE test_db"); + sql("CREATE TABLE test_table(c1 INT, c2 STRING)"); + sql("INSERT INTO test_table SELECT 1, 'a'"); + + Path databasePath = warehouseDir.toPath().resolve("test_db.db"); + assertThat(databasePath).exists(); + + Path tablePath = databasePath.resolve("test_table"); + assertThat(tablePath).exists(); + + Path dataPath = tablePath.resolve("data"); + assertThat(dataPath).exists(); + assertThat(Files.list(dataPath).count()) + .as("Should have a .crc file and a .parquet file") + .isEqualTo(2); + + sql("DROP TABLE test_table"); + dropDatabase("test_db", false); + dropCatalog("test_catalog", false); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java new file mode 100644 index 000000000000..ce18a1bb3d50 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java @@ -0,0 +1,474 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.Collections; +import java.util.List; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.ValidationException; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.catalog.UniqueConstraint; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.table.types.logical.BinaryType; +import org.apache.flink.table.types.logical.CharType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.VarBinaryType; +import org.apache.flink.table.types.logical.VarCharType; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkSchemaUtil { + + @Parameter private boolean isTableSchema; + + @Parameters(name = "isTableSchema={0}") + private static Object[][] parameters() { + return new Object[][] {{true}, {false}}; + } + + @TestTemplate + public void testConvertFlinkSchemaToIcebergSchema() { + ResolvedSchema flinkSchema = + ResolvedSchema.of( + Column.physical("id", DataTypes.INT().notNull()), + Column.physical("name", DataTypes.STRING()) /* optional by default */, + Column.physical("salary", DataTypes.DOUBLE().notNull()), + Column.physical( + "locations", + DataTypes.MAP( + DataTypes.STRING(), + DataTypes.ROW( + DataTypes.FIELD("posX", DataTypes.DOUBLE().notNull(), "X field"), + DataTypes.FIELD("posY", DataTypes.DOUBLE().notNull(), "Y field")))), + Column.physical("strArray", DataTypes.ARRAY(DataTypes.STRING()).nullable()), + Column.physical("intArray", DataTypes.ARRAY(DataTypes.INT()).nullable()), + Column.physical("char", DataTypes.CHAR(10).notNull()), + Column.physical("varchar", DataTypes.VARCHAR(10).notNull()), + Column.physical("boolean", DataTypes.BOOLEAN().nullable()), + Column.physical("tinyint", DataTypes.TINYINT()), + Column.physical("smallint", DataTypes.SMALLINT()), + Column.physical("bigint", DataTypes.BIGINT()), + Column.physical("varbinary", DataTypes.VARBINARY(10)), + Column.physical("binary", DataTypes.BINARY(10)), + Column.physical("time", DataTypes.TIME()), + Column.physical("timestampWithoutZone", DataTypes.TIMESTAMP()), + Column.physical("timestampWithZone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()), + Column.physical("date", DataTypes.DATE()), + Column.physical("decimal", DataTypes.DECIMAL(2, 2)), + Column.physical("decimal2", DataTypes.DECIMAL(38, 2)), + Column.physical("decimal3", DataTypes.DECIMAL(10, 1)), + Column.physical("multiset", DataTypes.MULTISET(DataTypes.STRING().notNull()))); + + Schema icebergSchema = + new Schema( + Types.NestedField.required(0, 
"id", Types.IntegerType.get(), null), + Types.NestedField.optional(1, "name", Types.StringType.get(), null), + Types.NestedField.required(2, "salary", Types.DoubleType.get(), null), + Types.NestedField.optional( + 3, + "locations", + Types.MapType.ofOptional( + 24, + 25, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(22, "posX", Types.DoubleType.get(), "X field"), + Types.NestedField.required( + 23, "posY", Types.DoubleType.get(), "Y field")))), + Types.NestedField.optional( + 4, "strArray", Types.ListType.ofOptional(26, Types.StringType.get())), + Types.NestedField.optional( + 5, "intArray", Types.ListType.ofOptional(27, Types.IntegerType.get())), + Types.NestedField.required(6, "char", Types.StringType.get()), + Types.NestedField.required(7, "varchar", Types.StringType.get()), + Types.NestedField.optional(8, "boolean", Types.BooleanType.get()), + Types.NestedField.optional(9, "tinyint", Types.IntegerType.get()), + Types.NestedField.optional(10, "smallint", Types.IntegerType.get()), + Types.NestedField.optional(11, "bigint", Types.LongType.get()), + Types.NestedField.optional(12, "varbinary", Types.BinaryType.get()), + Types.NestedField.optional(13, "binary", Types.FixedType.ofLength(10)), + Types.NestedField.optional(14, "time", Types.TimeType.get()), + Types.NestedField.optional( + 15, "timestampWithoutZone", Types.TimestampType.withoutZone()), + Types.NestedField.optional(16, "timestampWithZone", Types.TimestampType.withZone()), + Types.NestedField.optional(17, "date", Types.DateType.get()), + Types.NestedField.optional(18, "decimal", Types.DecimalType.of(2, 2)), + Types.NestedField.optional(19, "decimal2", Types.DecimalType.of(38, 2)), + Types.NestedField.optional(20, "decimal3", Types.DecimalType.of(10, 1)), + Types.NestedField.optional( + 21, + "multiset", + Types.MapType.ofRequired(28, 29, Types.StringType.get(), Types.IntegerType.get()))); + + checkSchema(flinkSchema, icebergSchema); + } + + @TestTemplate + public void testMapField() { + ResolvedSchema flinkSchema = + ResolvedSchema.of( + Column.physical( + "map_int_long", + DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT()).notNull()) /* Required */, + Column.physical( + "map_int_array_string", + DataTypes.MAP(DataTypes.ARRAY(DataTypes.INT()), DataTypes.STRING())), + Column.physical( + "map_decimal_string", DataTypes.MAP(DataTypes.DECIMAL(10, 2), DataTypes.STRING())), + Column.physical( + "map_fields_fields", + DataTypes.MAP( + DataTypes.ROW( + DataTypes.FIELD("field_int", DataTypes.INT(), "doc - int"), + DataTypes.FIELD("field_string", DataTypes.STRING(), "doc - string")) + .notNull(), /* Required */ + DataTypes.ROW( + DataTypes.FIELD( + "field_array", + DataTypes.ARRAY(DataTypes.STRING()), + "doc - array")) + .notNull() /* Required */) + .notNull() /* Required */)); + + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "map_int_long", + Types.MapType.ofOptional(4, 5, Types.IntegerType.get(), Types.LongType.get()), + null), + Types.NestedField.optional( + 1, + "map_int_array_string", + Types.MapType.ofOptional( + 7, + 8, + Types.ListType.ofOptional(6, Types.IntegerType.get()), + Types.StringType.get()), + null), + Types.NestedField.optional( + 2, + "map_decimal_string", + Types.MapType.ofOptional( + 9, 10, Types.DecimalType.of(10, 2), Types.StringType.get())), + Types.NestedField.required( + 3, + "map_fields_fields", + Types.MapType.ofRequired( + 15, + 16, + Types.StructType.of( + Types.NestedField.optional( + 11, "field_int", Types.IntegerType.get(), "doc - int"), + 
Types.NestedField.optional( + 12, "field_string", Types.StringType.get(), "doc - string")), + Types.StructType.of( + Types.NestedField.optional( + 14, + "field_array", + Types.ListType.ofOptional(13, Types.StringType.get()), + "doc - array"))))); + + checkSchema(flinkSchema, icebergSchema); + } + + @TestTemplate + public void testStructField() { + ResolvedSchema flinkSchema = + ResolvedSchema.of( + Column.physical( + "struct_int_string_decimal", + DataTypes.ROW( + DataTypes.FIELD("field_int", DataTypes.INT()), + DataTypes.FIELD("field_string", DataTypes.STRING()), + DataTypes.FIELD("field_decimal", DataTypes.DECIMAL(19, 2)), + DataTypes.FIELD( + "field_struct", + DataTypes.ROW( + DataTypes.FIELD("inner_struct_int", DataTypes.INT()), + DataTypes.FIELD( + "inner_struct_float_array", + DataTypes.ARRAY(DataTypes.FLOAT()))) + .notNull()) /* Row is required */) + .notNull()) /* Required */, + Column.physical( + "struct_map_int_int", + DataTypes.ROW( + DataTypes.FIELD( + "field_map", DataTypes.MAP(DataTypes.INT(), DataTypes.INT()))) + .nullable()) /* Optional */); + + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "struct_int_string_decimal", + Types.StructType.of( + Types.NestedField.optional(5, "field_int", Types.IntegerType.get()), + Types.NestedField.optional(6, "field_string", Types.StringType.get()), + Types.NestedField.optional(7, "field_decimal", Types.DecimalType.of(19, 2)), + Types.NestedField.required( + 8, + "field_struct", + Types.StructType.of( + Types.NestedField.optional( + 3, "inner_struct_int", Types.IntegerType.get()), + Types.NestedField.optional( + 4, + "inner_struct_float_array", + Types.ListType.ofOptional(2, Types.FloatType.get())))))), + Types.NestedField.optional( + 1, + "struct_map_int_int", + Types.StructType.of( + Types.NestedField.optional( + 11, + "field_map", + Types.MapType.ofOptional( + 9, 10, Types.IntegerType.get(), Types.IntegerType.get()))))); + + checkSchema(flinkSchema, icebergSchema); + } + + @TestTemplate + public void testListField() { + ResolvedSchema flinkSchema = + ResolvedSchema.of( + Column.physical( + "list_struct_fields", + DataTypes.ARRAY(DataTypes.ROW(DataTypes.FIELD("field_int", DataTypes.INT()))) + .notNull()) /* Required */, + Column.physical( + "list_optional_struct_fields", + DataTypes.ARRAY( + DataTypes.ROW( + DataTypes.FIELD( + "field_timestamp_with_local_time_zone", + DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()))) + .nullable()) /* Optional */, + Column.physical( + "list_map_fields", + DataTypes.ARRAY( + DataTypes.MAP( + DataTypes.ARRAY( + DataTypes.INT().notNull()), /* Key of map must be required */ + DataTypes.ROW( + DataTypes.FIELD("field_0", DataTypes.INT(), "doc - int"))) + .notNull()) + .notNull()) /* Required */); + + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "list_struct_fields", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + Types.NestedField.optional(3, "field_int", Types.IntegerType.get())))), + Types.NestedField.optional( + 1, + "list_optional_struct_fields", + Types.ListType.ofOptional( + 6, + Types.StructType.of( + Types.NestedField.optional( + 5, + "field_timestamp_with_local_time_zone", + Types.TimestampType.withZone())))), + Types.NestedField.required( + 2, + "list_map_fields", + Types.ListType.ofRequired( + 11, + Types.MapType.ofOptional( + 9, + 10, + Types.ListType.ofRequired(7, Types.IntegerType.get()), + Types.StructType.of( + Types.NestedField.optional( + 8, "field_0", Types.IntegerType.get(), "doc - int")))))); + + checkSchema(flinkSchema, 
icebergSchema); + } + + private void checkSchema(ResolvedSchema flinkSchema, Schema icebergSchema) { + if (isTableSchema) { + assertThat(FlinkSchemaUtil.convert(TableSchema.fromResolvedSchema(flinkSchema)).asStruct()) + .isEqualTo(icebergSchema.asStruct()); + // The conversion is not a 1:1 mapping, so we just check iceberg types. + assertThat( + FlinkSchemaUtil.convert( + FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergSchema))) + .asStruct()) + .isEqualTo(icebergSchema.asStruct()); + } else { + assertThat(FlinkSchemaUtil.convert(flinkSchema).asStruct()) + .isEqualTo(icebergSchema.asStruct()); + // The conversion is not a 1:1 mapping, so we just check iceberg types. + assertThat( + FlinkSchemaUtil.convert( + FlinkSchemaUtil.toResolvedSchema(FlinkSchemaUtil.convert(icebergSchema))) + .asStruct()) + .isEqualTo(icebergSchema.asStruct()); + } + } + + @Test + public void testInconsistentTypes() { + checkInconsistentType( + Types.UUIDType.get(), new BinaryType(16), new BinaryType(16), Types.FixedType.ofLength(16)); + checkInconsistentType( + Types.StringType.get(), + new VarCharType(VarCharType.MAX_LENGTH), + new CharType(100), + Types.StringType.get()); + checkInconsistentType( + Types.BinaryType.get(), + new VarBinaryType(VarBinaryType.MAX_LENGTH), + new VarBinaryType(100), + Types.BinaryType.get()); + checkInconsistentType( + Types.TimeType.get(), new TimeType(), new TimeType(3), Types.TimeType.get()); + checkInconsistentType( + Types.TimestampType.withoutZone(), + new TimestampType(6), + new TimestampType(3), + Types.TimestampType.withoutZone()); + checkInconsistentType( + Types.TimestampType.withZone(), + new LocalZonedTimestampType(6), + new LocalZonedTimestampType(3), + Types.TimestampType.withZone()); + } + + private void checkInconsistentType( + Type icebergType, + LogicalType flinkExpectedType, + LogicalType flinkType, + Type icebergExpectedType) { + assertThat(FlinkSchemaUtil.convert(icebergType)).isEqualTo(flinkExpectedType); + assertThat(FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(RowType.of(flinkType))).asStruct()) + .isEqualTo(Types.StructType.of(Types.NestedField.optional(0, "f0", icebergExpectedType))); + assertThat( + FlinkSchemaUtil.convert(FlinkSchemaUtil.toResolvedSchema(RowType.of(flinkType))) + .asStruct()) + .isEqualTo(Types.StructType.of(Types.NestedField.optional(0, "f0", icebergExpectedType))); + } + + @TestTemplate + public void testConvertFlinkSchemaBaseOnIcebergSchema() { + Schema baseSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required(101, "int", Types.IntegerType.get()), + Types.NestedField.optional(102, "string", Types.StringType.get())), + Sets.newHashSet(101)); + + Schema convertedSchema; + if (isTableSchema) { + TableSchema flinkSchema = + TableSchema.builder() + .field("int", DataTypes.INT().notNull()) + .field("string", DataTypes.STRING().nullable()) + .primaryKey("int") + .build(); + convertedSchema = FlinkSchemaUtil.convert(baseSchema, flinkSchema); + } else { + ResolvedSchema flinkSchema = + new ResolvedSchema( + List.of( + Column.physical("int", DataTypes.INT().notNull()), + Column.physical("string", DataTypes.STRING().nullable())), + Collections.emptyList(), + UniqueConstraint.primaryKey("pk", List.of("int"))); + convertedSchema = FlinkSchemaUtil.convert(baseSchema, flinkSchema); + } + + assertThat(convertedSchema.asStruct()).isEqualTo(baseSchema.asStruct()); + assertThat(convertedSchema.identifierFieldIds()).containsExactly(101); + } + + @TestTemplate + public void testConvertFlinkSchemaWithPrimaryKeys() { + Schema 
icebergSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required(1, "int", Types.IntegerType.get()), + Types.NestedField.required(2, "string", Types.StringType.get())), + Sets.newHashSet(1, 2)); + + if (isTableSchema) { + TableSchema tableSchema = FlinkSchemaUtil.toSchema(icebergSchema); + assertThat(tableSchema.getPrimaryKey()) + .isPresent() + .get() + .satisfies(k -> assertThat(k.getColumns()).containsExactly("int", "string")); + } else { + ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(icebergSchema); + assertThat(resolvedSchema.getPrimaryKey()) + .isPresent() + .get() + .satisfies(k -> assertThat(k.getColumns()).containsExactly("int", "string")); + } + } + + @TestTemplate + public void testConvertFlinkSchemaWithNestedColumnInPrimaryKeys() { + Schema icebergSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required( + 1, + "struct", + Types.StructType.of( + Types.NestedField.required(2, "inner", Types.IntegerType.get())))), + Sets.newHashSet(2)); + + if (isTableSchema) { + assertThatThrownBy(() -> FlinkSchemaUtil.toSchema(icebergSchema)) + .isInstanceOf(ValidationException.class) + .hasMessageStartingWith("Could not create a PRIMARY KEY") + .hasMessageContaining("Column 'struct.inner' does not exist."); + } else { + assertThatThrownBy(() -> FlinkSchemaUtil.toResolvedSchema(icebergSchema)) + .isInstanceOf(ValidationException.class) + .hasMessageStartingWith("Invalid primary key") + .hasMessageContaining("Column 'struct.inner' does not exist."); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java new file mode 100644 index 000000000000..d99f657a11cc --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.util.List; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.Expressions; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.source.BoundedTableFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkTableSink extends CatalogTestBase { + + private static final String TABLE_NAME = "test_table"; + private TableEnvironment tEnv; + private Table icebergTable; + + @Parameter(index = 2) + private FileFormat format; + + @Parameter(index = 3) + private boolean isStreamingJob; + + @Parameter(index = 4) + private boolean useV2Sink; + + @Parameters( + name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}, useV2Sink={4}") + public static List parameters() { + List parameters = Lists.newArrayList(); + for (FileFormat format : + new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { + for (Boolean isStreaming : new Boolean[] {true, false}) { + for (Object[] catalogParams : CatalogTestBase.parameters()) { + String catalogName = (String) catalogParams[0]; + Namespace baseNamespace = (Namespace) catalogParams[1]; + parameters.add( + new Object[] { + catalogName, baseNamespace, format, isStreaming, false /* don't use v2 sink */ + }); + } + } + } + + for (FileFormat format : + new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { + for (Boolean isStreaming : new Boolean[] {true, false}) { + String catalogName = "testhadoop_basenamespace"; + Namespace baseNamespace = Namespace.of("l0", "l1"); + parameters.add( + new Object[] {catalogName, baseNamespace, format, isStreaming, true /* use v2 sink */}); + } + } + + return parameters; + } + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); + if (isStreamingJob) { + settingsBuilder.inStreamingMode(); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(400); + env.setMaxParallelism(2); + env.setParallelism(2); + tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); + } else { + settingsBuilder.inBatchMode(); + tEnv = TableEnvironment.create(settingsBuilder.build()); + } + } + } + + tEnv.getConfig() + .getConfiguration() + .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_V2_SINK, useV2Sink); + + return tEnv; + } + + @Override + @BeforeEach + public void before() { + super.before(); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE_NAME, format.name()); + icebergTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, 
TABLE_NAME)); + } + + @Override + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); + dropDatabase(flinkDatabase, true); + BoundedTableFactory.clearDataSets(); + super.clean(); + } + + @TestTemplate + public void testInsertFromSourceTable() throws Exception { + // Register the rows into a temporary table. + getTableEnv() + .createTemporaryView( + "sourceTable", + getTableEnv() + .fromValues( + SimpleDataUtil.FLINK_SCHEMA.toSourceRowDataType(), + Expressions.row(1, "hello"), + Expressions.row(2, "world"), + Expressions.row(3, (String) null), + Expressions.row(null, "bar"))); + + // Redirect the records from source table to destination table. + sql("INSERT INTO %s SELECT id,data from sourceTable", TABLE_NAME); + + // Assert the table records as expected. + SimpleDataUtil.assertTableRecords( + icebergTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), + SimpleDataUtil.createRecord(2, "world"), + SimpleDataUtil.createRecord(3, null), + SimpleDataUtil.createRecord(null, "bar"))); + } + + @TestTemplate + public void testOverwriteTable() throws Exception { + assumeThat(isStreamingJob) + .as("Flink unbounded streaming does not support overwrite operation") + .isFalse(); + + sql("INSERT INTO %s SELECT 1, 'a'", TABLE_NAME); + SimpleDataUtil.assertTableRecords( + icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(1, "a"))); + + sql("INSERT OVERWRITE %s SELECT 2, 'b'", TABLE_NAME); + SimpleDataUtil.assertTableRecords( + icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(2, "b"))); + } + + @TestTemplate + public void testReplacePartitions() throws Exception { + assumeThat(isStreamingJob) + .as("Flink unbounded streaming does not support overwrite operation") + .isFalse(); + String tableName = "test_partition"; + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", + tableName, format.name()); + + try { + Table partitionedTable = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); + + sql("INSERT INTO %s SELECT 1, 'a'", tableName); + sql("INSERT INTO %s SELECT 2, 'b'", tableName); + sql("INSERT INTO %s SELECT 3, 'c'", tableName); + + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "b"), + SimpleDataUtil.createRecord(3, "c"))); + + sql("INSERT OVERWRITE %s SELECT 4, 'b'", tableName); + sql("INSERT OVERWRITE %s SELECT 5, 'a'", tableName); + + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(5, "a"), + SimpleDataUtil.createRecord(4, "b"), + SimpleDataUtil.createRecord(3, "c"))); + + sql("INSERT OVERWRITE %s PARTITION (data='a') SELECT 6", tableName); + + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(6, "a"), + SimpleDataUtil.createRecord(4, "b"), + SimpleDataUtil.createRecord(3, "c"))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } + + @TestTemplate + public void testInsertIntoPartition() throws Exception { + String tableName = "test_insert_into_partition"; + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", + tableName, format.name()); + + try { + Table partitionedTable = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); + + // Full partition. 
+ sql("INSERT INTO %s PARTITION (data='a') SELECT 1", tableName); + sql("INSERT INTO %s PARTITION (data='a') SELECT 2", tableName); + sql("INSERT INTO %s PARTITION (data='b') SELECT 3", tableName); + + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "a"), + SimpleDataUtil.createRecord(3, "b"))); + + // Partial partition. + sql("INSERT INTO %s SELECT 4, 'c'", tableName); + sql("INSERT INTO %s SELECT 5, 'd'", tableName); + + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "a"), + SimpleDataUtil.createRecord(3, "b"), + SimpleDataUtil.createRecord(4, "c"), + SimpleDataUtil.createRecord(5, "d"))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java new file mode 100644 index 000000000000..03d96ac2c573 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; +import org.apache.flink.table.types.DataType; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestReader; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.source.BoundedTableFactory; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkTableSinkCompaction extends CatalogTestBase { + + private static final TypeInformation ROW_TYPE_INFO = + new RowTypeInfo( + SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().stream() + .map(ExternalTypeInfo::of) + .toArray(TypeInformation[]::new)); + + private static final DataFormatConverters.RowConverter CONVERTER = + new DataFormatConverters.RowConverter( + SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().toArray(DataType[]::new)); + + private static final String TABLE_NAME = "test_table"; + private StreamTableEnvironment tEnv; + private StreamExecutionEnvironment env; + private Table icebergTable; + private static final String TABLE_PROPERTIES = + "'flink-maintenance.lock.type'='jdbc'," + + "'flink-maintenance.lock.jdbc.uri'='jdbc:sqlite:file::memory:?ic'," + + "'flink-maintenance.lock.jdbc.init-lock-table'='true'," + + "'flink-maintenance.rewrite.rewrite-all'='true'," + + "'flink-maintenance.rewrite.schedule.data-file-size'='1'," + + "'flink-maintenance.lock-check-delay-seconds'='60'"; + + @Parameter(index = 2) + private boolean userSqlHint; + + @Parameters(name = "catalogName={0}, baseNamespace={1}, userSqlHint={2}") + public static List parameters() { + return Arrays.asList( + new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1"), true}, + new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1"), false}); + } + + @Override + protected StreamTableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); + settingsBuilder.inStreamingMode(); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(100); + tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); + } + } + + tEnv.getConfig() + .getConfiguration() + 
.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_V2_SINK, true) + .set(FlinkWriteOptions.COMPACTION_ENABLE, true); + + return tEnv; + } + + @Override + @BeforeEach + public void before() { + super.before(); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + if (userSqlHint) { + sql("CREATE TABLE %s (id int, data varchar)", TABLE_NAME); + } else { + sql("CREATE TABLE %s (id int, data varchar) with (%s)", TABLE_NAME, TABLE_PROPERTIES); + } + + icebergTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + } + + @Override + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); + dropDatabase(flinkDatabase, true); + BoundedTableFactory.clearDataSets(); + super.clean(); + } + + @TestTemplate + public void testSQLCompactionE2e() throws Exception { + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(new BoundedTestSource<>(rows.toArray(new Row[0])), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + + getTableEnv().createTemporaryView("sourceTable", dataStream); + + // Redirect the records from source table to destination table. + if (userSqlHint) { + sql( + "INSERT INTO %s /*+ OPTIONS(%s) */ SELECT id,data from sourceTable", + TABLE_NAME, TABLE_PROPERTIES); + } else { + sql("INSERT INTO %s SELECT id,data from sourceTable", TABLE_NAME); + } + + // Assert the table records as expected. + SimpleDataUtil.assertTableRecords( + icebergTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), + SimpleDataUtil.createRecord(2, "world"), + SimpleDataUtil.createRecord(3, "foo"))); + + // check the data file count after compact + List afterCompactDataFiles = + getDataFiles(icebergTable.currentSnapshot(), icebergTable); + assertThat(afterCompactDataFiles).hasSize(1); + + // check the data file count before compact + List preCompactDataFiles = + getDataFiles( + icebergTable.snapshot(icebergTable.currentSnapshot().parentId()), icebergTable); + assertThat(preCompactDataFiles).hasSize(3); + } + + private List getDataFiles(Snapshot snapshot, Table table) throws IOException { + List dataFiles = Lists.newArrayList(); + for (ManifestFile dataManifest : snapshot.dataManifests(table.io())) { + try (ManifestReader reader = ManifestFiles.read(dataManifest, table.io())) { + reader.iterator().forEachRemaining(dataFiles::add); + } + } + + return dataFiles; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java new file mode 100644 index 000000000000..3afabf6e0795 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java @@ -0,0 +1,388 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.flink.FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.flink.api.dag.Transformation; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.sink.v2.DiscardingSink; +import org.apache.flink.streaming.api.transformations.SinkTransformation; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.api.internal.TableEnvironmentImpl; +import org.apache.flink.table.operations.ModifyOperation; +import org.apache.flink.table.planner.delegation.PlannerBase; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.sink.IcebergSink; +import org.apache.iceberg.flink.source.BoundedTableFactory; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +/** + * This class tests the more extended features of Flink sink. Extract them separately since it is + * unnecessary to test all the parameters combinations in {@link TestFlinkTableSink}, like catalog + * types, namespaces, file format, streaming/batch. Those combinations explode exponentially. Each + * test method in {@link TestFlinkTableSink} runs 21 combinations, which are expensive and slow. 
+ */ +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkTableSinkExtended extends SqlBase { + protected static final String CATALOG = "testhadoop"; + protected static final String DATABASE = "db"; + protected static final String TABLE = "tbl"; + + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + private static final String SOURCE_TABLE = "default_catalog.default_database.bounded_source"; + private static final String FLINK_DATABASE = CATALOG + "." + DATABASE; + private static final Namespace ICEBERG_NAMESPACE = Namespace.of(new String[] {DATABASE}); + + @TempDir protected File warehouseRoot; + + protected HadoopCatalog catalog = null; + + private TableEnvironment tEnv; + + @Parameter(index = 0) + protected boolean isStreamingJob; + + @Parameter(index = 1) + protected Boolean useV2Sink; + + @Parameters(name = "isStreamingJob={0}, useV2Sink={1}") + protected static List parameters() { + return Arrays.asList( + new Object[] {true, false}, + new Object[] {false, false}, + new Object[] {true, true}, + new Object[] {false, true}, + new Object[] {true, null}); + } + + protected synchronized TableEnvironment getTableEnv() { + if (tEnv == null) { + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); + if (isStreamingJob) { + settingsBuilder.inStreamingMode(); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(400); + env.setMaxParallelism(2); + env.setParallelism(2); + tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); + } else { + settingsBuilder.inBatchMode(); + tEnv = TableEnvironment.create(settingsBuilder.build()); + } + } + + if (useV2Sink != null) { + tEnv.getConfig() + .getConfiguration() + .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_V2_SINK, useV2Sink); + } + + return tEnv; + } + + @BeforeEach + public void before() { + String warehouseLocation = "file:" + warehouseRoot.getPath(); + this.catalog = new HadoopCatalog(new Configuration(), warehouseLocation); + Map config = Maps.newHashMap(); + config.put("type", "iceberg"); + config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HADOOP); + config.put(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation); + sql("CREATE CATALOG %s WITH %s", CATALOG, toWithClause(config)); + + sql("CREATE DATABASE %s", FLINK_DATABASE); + sql("USE CATALOG %s", CATALOG); + sql("USE %s", DATABASE); + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE, FileFormat.PARQUET.name()); + } + + @AfterEach + public void clean() throws Exception { + sql("DROP TABLE IF EXISTS %s.%s", FLINK_DATABASE, TABLE); + dropDatabase(FLINK_DATABASE, true); + BoundedTableFactory.clearDataSets(); + + dropCatalog(CATALOG, true); + catalog.close(); + } + + @TestTemplate + public void testUsedFlinkSinkInterface() { + String dataId = BoundedTableFactory.registerDataSet(Collections.emptyList()); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + + PlannerBase planner = (PlannerBase) ((TableEnvironmentImpl) getTableEnv()).getPlanner(); + String insertSQL = String.format("INSERT INTO %s SELECT * FROM %s", TABLE, SOURCE_TABLE); + ModifyOperation operation = (ModifyOperation) planner.getParser().parse(insertSQL).get(0); + Transformation 
transformation = + planner.translate(Collections.singletonList(operation)).get(0); + assertThat(transformation).as("Should use SinkV2 API").isInstanceOf(SinkTransformation.class); + SinkTransformation sinkTransformation = (SinkTransformation) transformation; + if (useV2Sink != null && useV2Sink) { + assertThat(sinkTransformation.getSink()) + .as("Should use SinkV2 API based implementation") + .isInstanceOf(IcebergSink.class); + } else { + assertThat(sinkTransformation.getSink()) + .as("Should use custom chain of StreamOperators terminated by DiscardingSink") + .isInstanceOf(DiscardingSink.class); + } + } + + @TestTemplate + public void testWriteParallelism() { + List dataSet = + IntStream.range(1, 1000) + .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) + .flatMap(List::stream) + .collect(Collectors.toList()); + String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + + PlannerBase planner = (PlannerBase) ((TableEnvironmentImpl) getTableEnv()).getPlanner(); + String insertSQL = + String.format( + "INSERT INTO %s /*+ OPTIONS('write-parallelism'='1') */ SELECT * FROM %s", + TABLE, SOURCE_TABLE); + ModifyOperation operation = (ModifyOperation) planner.getParser().parse(insertSQL).get(0); + Transformation sink = planner.translate(Collections.singletonList(operation)).get(0); + if (useV2Sink != null && useV2Sink) { + assertThat(sink.getParallelism()).as("Should have the expected 1 parallelism.").isEqualTo(1); + Transformation writerInput = sink.getInputs().get(0); + assertThat(writerInput.getParallelism()) + .as("Should have the expected parallelism.") + .isEqualTo(isStreamingJob ? 2 : 4); + } else { + Transformation committer = sink.getInputs().get(0); + Transformation writer = committer.getInputs().get(0); + + assertThat(writer.getParallelism()) + .as("Should have the expected 1 parallelism.") + .isEqualTo(1); + Transformation writerInput = writer.getInputs().get(0); + assertThat(writerInput.getParallelism()) + .as("Should have the expected parallelism.") + .isEqualTo(isStreamingJob ? 2 : 4); + } + } + + @TestTemplate + public void testHashDistributeMode() throws Exception { + // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. + List dataSet = + IntStream.range(1, 1000) + .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) + .flatMap(List::stream) + .collect(Collectors.toList()); + String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + + assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)) + .as("Should have the expected rows in source table.") + .containsExactlyInAnyOrderElementsOf(dataSet); + + Map tableProps = + ImmutableMap.of( + "write.format.default", + FileFormat.PARQUET.name(), + TableProperties.WRITE_DISTRIBUTION_MODE, + DistributionMode.HASH.modeName()); + + String tableName = "test_hash_distribution_mode"; + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", + tableName, toWithClause(tableProps)); + + try { + // Insert data set. 
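+      // With 'write.distribution-mode'='hash' each partition value should end up in only one
+      // data file per snapshot; the assertions below check this for every snapshot.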
+ sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); + + assertThat(sql("SELECT * FROM %s", tableName)) + .as("Should have the expected rows in sink table.") + .containsExactlyInAnyOrderElementsOf(dataSet); + + // Sometimes we will have more than one checkpoint if we pass the auto checkpoint interval, + // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per + // partition. + Table table = catalog.loadTable(TableIdentifier.of(ICEBERG_NAMESPACE, tableName)); + Map> snapshotToDataFiles = SimpleDataUtil.snapshotToDataFiles(table); + for (List dataFiles : snapshotToDataFiles.values()) { + if (dataFiles.isEmpty()) { + continue; + } + + assertThat( + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "aaa"))) + .hasSize(1); + assertThat( + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "bbb"))) + .hasSize(1); + assertThat( + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "ccc"))) + .hasSize(1); + } + } finally { + sql("DROP TABLE IF EXISTS %s.%s", FLINK_DATABASE, tableName); + } + } + + @TestTemplate + public void testRangeDistributionPartitionColumn() { + // Range partitioner currently only works with streaming writes (with checkpoints) + assumeThat(isStreamingJob).isTrue(); + + // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. + List> rowsPerCheckpoint = + IntStream.range(1, 6) + .mapToObj( + checkpointId -> { + List charRows = Lists.newArrayList(); + // emit 26x10 rows for each checkpoint cycle + for (int i = 0; i < 10; ++i) { + for (char c = 'a'; c <= 'z'; c++) { + charRows.add(Row.of(c - 'a', String.valueOf(c))); + } + } + return charRows; + }) + .collect(Collectors.toList()); + List flattenedRows = + rowsPerCheckpoint.stream().flatMap(List::stream).collect(Collectors.toList()); + + String dataId = BoundedTableFactory.registerDataSet(rowsPerCheckpoint); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + + assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)) + .as("Should have the expected rows in source table.") + .containsExactlyInAnyOrderElementsOf(flattenedRows); + + Map tableProps = + ImmutableMap.of( + "write.format.default", + FileFormat.PARQUET.name(), + TableProperties.WRITE_DISTRIBUTION_MODE, + DistributionMode.RANGE.modeName()); + + String tableName = "test_hash_distribution_mode"; + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", + tableName, toWithClause(tableProps)); + + try { + // Insert data set. + sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); + + assertThat(sql("SELECT * FROM %s", tableName)) + .as("Should have the expected rows in sink table.") + .containsExactlyInAnyOrderElementsOf(flattenedRows); + + Table table = catalog.loadTable(TableIdentifier.of(ICEBERG_NAMESPACE, tableName)); + // ordered in reverse timeline from the newest snapshot to the oldest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Sometimes we will have more checkpoints than the bounded source if we pass the + // auto checkpoint interval. Thus producing multiple snapshots. 
+ assertThat(snapshots).hasSizeGreaterThanOrEqualTo(5); + + // It takes 2 checkpoint cycle for statistics collection and application + // of the globally aggregated statistics in the range partitioner. + // The last two checkpoints should have range shuffle applied + List rangePartitionedCycles = + snapshots.subList(snapshots.size() - 2, snapshots.size()); + + for (Snapshot snapshot : rangePartitionedCycles) { + List addedDataFiles = + Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); + // range partition results in each partition only assigned to one writer task + // maybe less than 26 partitions as BoundedSource doesn't always precisely + // control the checkpoint boundary. + // It is hard to precisely control the test condition in SQL tests. + // Here only minimal safe assertions are applied to avoid flakiness. + // If there are no shuffling, the number of data files could be as high as + // 26 * 4 as the default parallelism is set to 4 for the mini cluster. + assertThat(addedDataFiles).hasSizeLessThanOrEqualTo(26); + } + } finally { + sql("DROP TABLE IF EXISTS %s.%s", FLINK_DATABASE, tableName); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java new file mode 100644 index 000000000000..c5a7ec4beec6 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.time.LocalDate; +import java.util.List; +import java.util.Map; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.Timeout; + +@Timeout(60) +public class TestFlinkUpsert extends CatalogTestBase { + + @Parameter(index = 2) + private FileFormat format; + + @Parameter(index = 3) + private boolean isStreamingJob; + + private final Map tableUpsertProps = Maps.newHashMap(); + private TableEnvironment tEnv; + + @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") + public static List parameters() { + List parameters = Lists.newArrayList(); + for (FileFormat format : + new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO, FileFormat.ORC}) { + for (Boolean isStreaming : new Boolean[] {true, false}) { + // Only test with one catalog as this is a file operation concern. + // FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop + // catalog. 
+ String catalogName = "testhadoop"; + Namespace baseNamespace = Namespace.of("default"); + parameters.add(new Object[] {catalogName, baseNamespace, format, isStreaming}); + } + } + return parameters; + } + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); + if (isStreamingJob) { + settingsBuilder.inStreamingMode(); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(400); + env.setMaxParallelism(2); + env.setParallelism(2); + tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); + } else { + settingsBuilder.inBatchMode(); + tEnv = TableEnvironment.create(settingsBuilder.build()); + } + } + } + return tEnv; + } + + @Override + @BeforeEach + public void before() { + super.before(); + sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + tableUpsertProps.put(TableProperties.FORMAT_VERSION, "2"); + tableUpsertProps.put(TableProperties.UPSERT_ENABLED, "true"); + tableUpsertProps.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + } + + @Override + @AfterEach + public void clean() { + dropDatabase(flinkDatabase, true); + super.clean(); + } + + @TestTemplate + public void testUpsertAndQuery() { + String tableName = "test_upsert_query"; + LocalDate dt20220301 = LocalDate.of(2022, 3, 1); + LocalDate dt20220302 = LocalDate.of(2022, 3, 2); + + sql( + "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, dt DATE, PRIMARY KEY(id,dt) NOT ENFORCED) " + + "PARTITIONED BY (dt) WITH %s", + tableName, toWithClause(tableUpsertProps)); + + try { + sql( + "INSERT INTO %s VALUES " + + "(1, 'Bill', DATE '2022-03-01')," + + "(1, 'Jane', DATE '2022-03-01')," + + "(2, 'Jane', DATE '2022-03-01')", + tableName); + + sql( + "INSERT INTO %s VALUES " + + "(2, 'Bill', DATE '2022-03-01')," + + "(1, 'Jane', DATE '2022-03-02')," + + "(2, 'Jane', DATE '2022-03-02')", + tableName); + + List rowsOn20220301 = + Lists.newArrayList(Row.of(1, "Jane", dt20220301), Row.of(2, "Bill", dt20220301)); + TestHelpers.assertRows( + sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); + + List rowsOn20220302 = + Lists.newArrayList(Row.of(1, "Jane", dt20220302), Row.of(2, "Jane", dt20220302)); + TestHelpers.assertRows( + sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Iterables.concat(rowsOn20220301, rowsOn20220302))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } + + @TestTemplate + public void testUpsertOptions() { + String tableName = "test_upsert_options"; + LocalDate dt20220301 = LocalDate.of(2022, 3, 1); + LocalDate dt20220302 = LocalDate.of(2022, 3, 2); + + Map optionsUpsertProps = Maps.newHashMap(tableUpsertProps); + optionsUpsertProps.remove(TableProperties.UPSERT_ENABLED); + sql( + "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, dt DATE, PRIMARY KEY(id,dt) NOT ENFORCED) " + + "PARTITIONED BY (dt) WITH %s", + tableName, toWithClause(optionsUpsertProps)); + + try { + sql( + "INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ VALUES " + + "(1, 'Bill', DATE '2022-03-01')," + + "(1, 'Jane', DATE '2022-03-01')," + + "(2, 'Jane', DATE '2022-03-01')", + tableName); + + sql( + "INSERT INTO %s /*+ 
OPTIONS('upsert-enabled'='true')*/ VALUES " + + "(2, 'Bill', DATE '2022-03-01')," + + "(1, 'Jane', DATE '2022-03-02')," + + "(2, 'Jane', DATE '2022-03-02')", + tableName); + + List rowsOn20220301 = + Lists.newArrayList(Row.of(1, "Jane", dt20220301), Row.of(2, "Bill", dt20220301)); + TestHelpers.assertRows( + sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); + + List rowsOn20220302 = + Lists.newArrayList(Row.of(1, "Jane", dt20220302), Row.of(2, "Jane", dt20220302)); + TestHelpers.assertRows( + sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Iterables.concat(rowsOn20220301, rowsOn20220302))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } + + @TestTemplate + public void testPrimaryKeyEqualToPartitionKey() { + // This is an SQL based reproduction of TestFlinkIcebergSinkV2#testUpsertOnDataKey + String tableName = "upsert_on_id_key"; + try { + sql( + "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, PRIMARY KEY(id) NOT ENFORCED) " + + "PARTITIONED BY (id) WITH %s", + tableName, toWithClause(tableUpsertProps)); + + sql("INSERT INTO %s VALUES " + "(1, 'Bill')," + "(1, 'Jane')," + "(2, 'Bill')", tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of(1, "Jane"), Row.of(2, "Bill"))); + + sql("INSERT INTO %s VALUES " + "(1, 'Bill')," + "(2, 'Jane')", tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of(1, "Bill"), Row.of(2, "Jane"))); + + sql("INSERT INTO %s VALUES " + "(3, 'Bill')," + "(4, 'Jane')", tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList( + Row.of(1, "Bill"), Row.of(2, "Jane"), Row.of(3, "Bill"), Row.of(4, "Jane"))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } + + @TestTemplate + public void testPrimaryKeyFieldsAtBeginningOfSchema() { + String tableName = "upsert_on_pk_at_schema_start"; + LocalDate dt = LocalDate.of(2022, 3, 1); + try { + sql( + "CREATE TABLE %s(id INT, dt DATE NOT NULL, name STRING NOT NULL, PRIMARY KEY(id,dt) NOT ENFORCED) " + + "PARTITIONED BY (dt) WITH %s", + tableName, toWithClause(tableUpsertProps)); + + sql( + "INSERT INTO %s VALUES " + + "(1, DATE '2022-03-01', 'Andy')," + + "(1, DATE '2022-03-01', 'Bill')," + + "(2, DATE '2022-03-01', 'Jane')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of(1, dt, "Bill"), Row.of(2, dt, "Jane"))); + + sql( + "INSERT INTO %s VALUES " + + "(1, DATE '2022-03-01', 'Jane')," + + "(2, DATE '2022-03-01', 'Bill')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of(1, dt, "Jane"), Row.of(2, dt, "Bill"))); + + sql( + "INSERT INTO %s VALUES " + + "(3, DATE '2022-03-01', 'Duke')," + + "(4, DATE '2022-03-01', 'Leon')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList( + Row.of(1, dt, "Jane"), + Row.of(2, dt, "Bill"), + Row.of(3, dt, "Duke"), + Row.of(4, dt, "Leon"))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } + + @TestTemplate + public void testPrimaryKeyFieldsAtEndOfTableSchema() { + // This is the same test case as testPrimaryKeyFieldsAtBeginningOfSchema, but the primary key + // fields + // are located at the end of the flink schema. 
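+    // The expected rows below are the same as in the previous test, only with the columns
+    // reordered to (name, id, dt).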
+ String tableName = "upsert_on_pk_at_schema_end"; + LocalDate dt = LocalDate.of(2022, 3, 1); + try { + sql( + "CREATE TABLE %s(name STRING NOT NULL, id INT, dt DATE NOT NULL, PRIMARY KEY(id,dt) NOT ENFORCED) " + + "PARTITIONED BY (dt) WITH %s", + tableName, toWithClause(tableUpsertProps)); + + sql( + "INSERT INTO %s VALUES " + + "('Andy', 1, DATE '2022-03-01')," + + "('Bill', 1, DATE '2022-03-01')," + + "('Jane', 2, DATE '2022-03-01')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of("Bill", 1, dt), Row.of("Jane", 2, dt))); + + sql( + "INSERT INTO %s VALUES " + + "('Jane', 1, DATE '2022-03-01')," + + "('Bill', 2, DATE '2022-03-01')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of("Jane", 1, dt), Row.of("Bill", 2, dt))); + + sql( + "INSERT INTO %s VALUES " + + "('Duke', 3, DATE '2022-03-01')," + + "('Leon', 4, DATE '2022-03-01')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList( + Row.of("Jane", 1, dt), + Row.of("Bill", 2, dt), + Row.of("Duke", 3, dt), + Row.of("Leon", 4, dt))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java new file mode 100644 index 000000000000..d8d3c5dc249b --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java @@ -0,0 +1,669 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetDateTime; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import org.apache.avro.generic.GenericData; +import org.apache.flink.api.common.serialization.SerializerConfigImpl; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.conversion.DataStructureConverter; +import org.apache.flink.table.data.conversion.DataStructureConverters; +import org.apache.flink.table.runtime.typeutils.InternalSerializers; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.Row; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.data.GenericDataUtil; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.data.RowDataUtil; +import org.apache.iceberg.flink.source.FlinkInputFormat; +import org.apache.iceberg.flink.source.FlinkInputSplit; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Streams; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.DateTimeUtil; + +public class TestHelpers { + private TestHelpers() {} + + public static T roundTripKryoSerialize(Class clazz, T table) throws IOException { + KryoSerializer kryo = new KryoSerializer<>(clazz, new SerializerConfigImpl()); + + DataOutputSerializer outputView = new DataOutputSerializer(1024); + kryo.serialize(table, outputView); + + DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); + return kryo.deserialize(inputView); + } + + public static RowData copyRowData(RowData from, RowType rowType) { + TypeSerializer[] fieldSerializers = + rowType.getChildren().stream() + .map((LogicalType type) -> InternalSerializers.create(type)) + .toArray(TypeSerializer[]::new); + RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; + for (int i = 0; i < rowType.getFieldCount(); ++i) { + fieldGetters[i] = FlinkRowData.createFieldGetter(rowType.getTypeAt(i), i); + } + + return RowDataUtil.clone(from, null, rowType, fieldSerializers, fieldGetters); + } + + public static void readRowData(FlinkInputFormat input, Consumer visitor) + throws IOException { + for (FlinkInputSplit 
s : input.createInputSplits(0)) { + input.open(s); + try { + while (!input.reachedEnd()) { + RowData row = input.nextRecord(null); + visitor.accept(row); + } + } finally { + input.close(); + } + } + } + + public static List readRowData(FlinkInputFormat inputFormat, RowType rowType) + throws IOException { + List results = Lists.newArrayList(); + readRowData(inputFormat, row -> results.add(copyRowData(row, rowType))); + return results; + } + + public static List readRows(FlinkInputFormat inputFormat, RowType rowType) + throws IOException { + return convertRowDataToRow(readRowData(inputFormat, rowType), rowType); + } + + public static List convertRowDataToRow(List rowDataList, RowType rowType) { + DataStructureConverter converter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); + return rowDataList.stream() + .map(converter::toExternal) + .map(Row.class::cast) + .collect(Collectors.toList()); + } + + public static List convertRecordToRow(List expectedRecords, Schema schema) { + List expected = Lists.newArrayList(); + @SuppressWarnings("unchecked") + DataStructureConverter converter = + (DataStructureConverter) + DataStructureConverters.getConverter( + TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(schema))); + expectedRecords.forEach( + r -> expected.add(converter.toExternal(RowDataConverter.convert(schema, r)))); + return expected; + } + + public static void assertRecordsWithOrder( + List results, List expectedRecords, Schema schema) { + List expected = convertRecordToRow(expectedRecords, schema); + assertRowsWithOrder(results, expected); + } + + public static void assertRecords(List results, List expectedRecords, Schema schema) { + List expected = convertRecordToRow(expectedRecords, schema); + assertRows(results, expected); + } + + public static void assertRows(List results, List expected, RowType rowType) { + assertRows(convertRowDataToRow(results, rowType), convertRowDataToRow(expected, rowType)); + } + + public static void assertRows(List results, List expected) { + assertThat(results).containsExactlyInAnyOrderElementsOf(expected); + } + + public static void assertRowsWithOrder(List results, List expected) { + assertThat(results).containsExactlyElementsOf(expected); + } + + public static void assertRowData(Schema schema, StructLike expected, RowData actual) { + assertRowData(schema.asStruct(), FlinkSchemaUtil.convert(schema), expected, actual); + } + + public static void assertRowData( + Types.StructType structType, + LogicalType rowType, + StructLike expectedRecord, + RowData actualRowData) { + if (expectedRecord == null && actualRowData == null) { + return; + } + + assertThat(expectedRecord).isNotNull(); + assertThat(actualRowData).isNotNull(); + + List types = Lists.newArrayList(); + for (Types.NestedField field : structType.fields()) { + types.add(field.type()); + } + + if (expectedRecord instanceof Record) { + Record expected = (Record) expectedRecord; + Types.StructType expectedType = expected.struct(); + int pos = 0; + for (Types.NestedField field : structType.fields()) { + Types.NestedField expectedField = expectedType.field(field.fieldId()); + LogicalType logicalType = ((RowType) rowType).getTypeAt(pos); + Object actualValue = + FlinkRowData.createFieldGetter(logicalType, pos).getFieldOrNull(actualRowData); + if (expectedField != null) { + assertEquals( + field.type(), logicalType, expected.getField(expectedField.name()), actualValue); + } else { + // convert the initial value to generic because that is the data model used 
to generate + // the expected records + assertEquals( + field.type(), + logicalType, + GenericDataUtil.internalToGeneric(field.type(), field.initialDefault()), + actualValue); + } + pos += 1; + } + + } else { + for (int i = 0; i < types.size(); i += 1) { + LogicalType logicalType = ((RowType) rowType).getTypeAt(i); + Object expected = expectedRecord.get(i, Object.class); + Object actual = + FlinkRowData.createFieldGetter(logicalType, i).getFieldOrNull(actualRowData); + assertEquals(types.get(i), logicalType, expected, actual); + } + } + } + + private static void assertEquals( + Type type, LogicalType logicalType, Object expected, Object actual) { + + if (expected == null && actual == null) { + return; + } + + assertThat(expected).isNotNull(); + assertThat(actual).isNotNull(); + + switch (type.typeId()) { + case BOOLEAN: + assertThat(actual).as("boolean value should be equal").isEqualTo(expected); + break; + case INTEGER: + assertThat(actual).as("int value should be equal").isEqualTo(expected); + break; + case LONG: + assertThat(actual).as("long value should be equal").isEqualTo(expected); + break; + case FLOAT: + assertThat(actual).as("float value should be equal").isEqualTo(expected); + break; + case DOUBLE: + assertThat(actual).as("double value should be equal").isEqualTo(expected); + break; + case STRING: + assertThat(expected).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); + assertThat(actual.toString()) + .as("string should be equal") + .isEqualTo(String.valueOf(expected)); + break; + case DATE: + assertThat(expected).as("Should expect a Date").isInstanceOf(LocalDate.class); + LocalDate date = DateTimeUtil.dateFromDays((int) actual); + assertThat(date).as("date should be equal").isEqualTo(expected); + break; + case TIME: + assertThat(expected).as("Should expect a LocalTime").isInstanceOf(LocalTime.class); + int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); + assertThat(actual).as("time millis should be equal").isEqualTo(milliseconds); + break; + case TIMESTAMP: + if (((Types.TimestampType) type).shouldAdjustToUTC()) { + assertThat(expected) + .as("Should expect a OffsetDataTime") + .isInstanceOf(OffsetDateTime.class); + OffsetDateTime ts = (OffsetDateTime) expected; + assertThat(((TimestampData) actual).toLocalDateTime()) + .as("OffsetDataTime should be equal") + .isEqualTo(ts.toLocalDateTime()); + } else { + assertThat(expected) + .as("Should expect a LocalDataTime") + .isInstanceOf(LocalDateTime.class); + LocalDateTime ts = (LocalDateTime) expected; + assertThat(((TimestampData) actual).toLocalDateTime()) + .as("LocalDataTime should be equal") + .isEqualTo(ts); + } + break; + case TIMESTAMP_NANO: + if (((Types.TimestampNanoType) type).shouldAdjustToUTC()) { + assertThat(expected) + .as("Should expect a OffsetDataTime") + .isInstanceOf(OffsetDateTime.class); + OffsetDateTime ts = (OffsetDateTime) expected; + assertThat(((TimestampData) actual).toLocalDateTime()) + .as("OffsetDataTime should be equal") + .isEqualTo(ts.toLocalDateTime()); + } else { + assertThat(expected) + .as("Should expect a LocalDataTime") + .isInstanceOf(LocalDateTime.class); + LocalDateTime ts = (LocalDateTime) expected; + assertThat(((TimestampData) actual).toLocalDateTime()) + .as("LocalDataTime should be equal") + .isEqualTo(ts); + } + break; + case BINARY: + assertThat(ByteBuffer.wrap((byte[]) actual)) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class) + .isEqualTo(expected); + break; + case DECIMAL: + assertThat(expected).as("Should expect a 
BigDecimal").isInstanceOf(BigDecimal.class); + BigDecimal bd = (BigDecimal) expected; + assertThat(((DecimalData) actual).toBigDecimal()) + .as("decimal value should be equal") + .isEqualTo(bd); + break; + case LIST: + assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Collection expectedArrayData = (Collection) expected; + ArrayData actualArrayData = (ArrayData) actual; + LogicalType elementType = ((ArrayType) logicalType).getElementType(); + assertThat(actualArrayData.size()) + .as("array length should be equal") + .isEqualTo(expectedArrayData.size()); + assertArrayValues( + type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); + break; + case MAP: + assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); + assertMapValues(type.asMapType(), logicalType, (Map) expected, (MapData) actual); + break; + case STRUCT: + assertThat(expected).as("Should expect a Record").isInstanceOf(StructLike.class); + assertRowData(type.asStructType(), logicalType, (StructLike) expected, (RowData) actual); + break; + case UUID: + assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); + ByteBuffer bb = ByteBuffer.wrap((byte[]) actual); + long firstLong = bb.getLong(); + long secondLong = bb.getLong(); + assertThat(new UUID(firstLong, secondLong).toString()) + .as("UUID should be equal") + .isEqualTo(expected.toString()); + break; + case FIXED: + assertThat(actual) + .as("Should expect byte[]") + .isInstanceOf(byte[].class) + .isEqualTo(expected); + break; + default: + throw new IllegalArgumentException("Not a supported type: " + type); + } + } + + public static void assertEquals(Schema schema, List records, List rows) { + Streams.forEachPair( + records.stream(), rows.stream(), (record, row) -> assertEquals(schema, record, row)); + } + + public static void assertEquals(Schema schema, GenericData.Record record, Row row) { + List fields = schema.asStruct().fields(); + assertThat(fields).hasSameSizeAs(record.getSchema().getFields()); + assertThat(fields).hasSize(row.getArity()); + + RowType rowType = FlinkSchemaUtil.convert(schema); + for (int i = 0; i < fields.size(); ++i) { + Type fieldType = fields.get(i).type(); + Object expectedValue = record.get(i); + Object actualValue = row.getField(i); + LogicalType logicalType = rowType.getTypeAt(i); + assertAvroEquals(fieldType, logicalType, expectedValue, actualValue); + } + } + + private static void assertEquals(Types.StructType struct, GenericData.Record record, Row row) { + List fields = struct.fields(); + for (int i = 0; i < fields.size(); i += 1) { + Type fieldType = fields.get(i).type(); + Object expectedValue = record.get(i); + Object actualValue = row.getField(i); + assertAvroEquals(fieldType, null, expectedValue, actualValue); + } + } + + private static void assertAvroEquals( + Type type, LogicalType logicalType, Object expected, Object actual) { + + if (expected == null && actual == null) { + return; + } + assertThat(expected).isNotNull(); + assertThat(actual).isNotNull(); + + switch (type.typeId()) { + case BOOLEAN: + case INTEGER: + case LONG: + case FLOAT: + case DOUBLE: + assertThat(expected) + .as("Should expect a " + type.typeId().javaClass()) + .isInstanceOf(type.typeId().javaClass()); + assertThat(actual) + .as("Should expect a " + type.typeId().javaClass()) + .isInstanceOf(type.typeId().javaClass()); + assertThat(actual).as(type.typeId() + " value should be equal").isEqualTo(expected); + break; + case STRING: + assertThat(expected).as("Should expect a 
CharSequence").isInstanceOf(CharSequence.class); + assertThat(actual).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); + assertThat(actual.toString()).as("string should be equal").isEqualTo(expected.toString()); + break; + case DATE: + assertThat(expected).as("Should expect a Date").isInstanceOf(LocalDate.class); + LocalDate date = DateTimeUtil.dateFromDays((int) actual); + assertThat(date).as("date should be equal").isEqualTo(expected); + break; + case TIME: + assertThat(expected).as("Should expect a LocalTime").isInstanceOf(LocalTime.class); + int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); + assertThat(actual).as("time millis should be equal").isEqualTo(milliseconds); + break; + case TIMESTAMP: + if (((Types.TimestampType) type).shouldAdjustToUTC()) { + assertThat(expected) + .as("Should expect a OffsetDataTime") + .isInstanceOf(OffsetDateTime.class); + OffsetDateTime ts = (OffsetDateTime) expected; + assertThat(((TimestampData) actual).toLocalDateTime()) + .as("OffsetDataTime should be equal") + .isEqualTo(ts.toLocalDateTime()); + } else { + assertThat(expected) + .as("Should expect a LocalDataTime") + .isInstanceOf(LocalDateTime.class); + LocalDateTime ts = (LocalDateTime) expected; + assertThat(((TimestampData) actual).toLocalDateTime()) + .as("LocalDataTime should be equal") + .isEqualTo(ts); + } + break; + case BINARY: + assertThat(ByteBuffer.wrap((byte[]) actual)) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class) + .isEqualTo(expected); + break; + case DECIMAL: + assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + BigDecimal bd = (BigDecimal) expected; + assertThat(((DecimalData) actual).toBigDecimal()) + .as("decimal value should be equal") + .isEqualTo(bd); + break; + case LIST: + assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Collection expectedArrayData = (Collection) expected; + ArrayData actualArrayData; + try { + actualArrayData = (ArrayData) actual; + } catch (ClassCastException e) { + actualArrayData = new GenericArrayData((Object[]) actual); + } + LogicalType elementType = ((ArrayType) logicalType).getElementType(); + assertThat(actualArrayData.size()) + .as("array length should be equal") + .isEqualTo(expectedArrayData.size()); + assertArrayValues( + type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); + break; + case MAP: + assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); + MapData actualMap; + try { + actualMap = (MapData) actual; + } catch (ClassCastException e) { + actualMap = new GenericMapData((Map) actual); + } + assertMapValues(type.asMapType(), logicalType, (Map) expected, actualMap); + break; + case STRUCT: + assertThat(expected).as("Should expect a Record").isInstanceOf(GenericData.Record.class); + assertEquals( + type.asNestedType().asStructType(), (GenericData.Record) expected, (Row) actual); + break; + case UUID: + assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); + ByteBuffer bb = ByteBuffer.wrap((byte[]) actual); + long firstLong = bb.getLong(); + long secondLong = bb.getLong(); + assertThat(new UUID(firstLong, secondLong).toString()) + .as("UUID should be equal") + .isEqualTo(expected.toString()); + break; + case FIXED: + assertThat(actual) + .as("Should expect byte[]") + .isInstanceOf(byte[].class) + .isEqualTo(expected); + break; + default: + throw new IllegalArgumentException("Not a supported type: " + type); + } + } + + private 
static void assertArrayValues( + Type type, LogicalType logicalType, Collection expectedArray, ArrayData actualArray) { + List expectedElements = Lists.newArrayList(expectedArray); + for (int i = 0; i < expectedArray.size(); i += 1) { + if (expectedElements.get(i) == null) { + assertThat(actualArray.isNullAt(i)).isTrue(); + continue; + } + + Object expected = expectedElements.get(i); + + assertEquals( + type, + logicalType, + expected, + ArrayData.createElementGetter(logicalType).getElementOrNull(actualArray, i)); + } + } + + private static void assertMapValues( + Types.MapType mapType, LogicalType type, Map expected, MapData actual) { + assertThat(actual.size()).as("map size should be equal").isEqualTo(expected.size()); + + ArrayData actualKeyArrayData = actual.keyArray(); + ArrayData actualValueArrayData = actual.valueArray(); + LogicalType actualKeyType = ((MapType) type).getKeyType(); + LogicalType actualValueType = ((MapType) type).getValueType(); + Type keyType = mapType.keyType(); + Type valueType = mapType.valueType(); + + ArrayData.ElementGetter keyGetter = ArrayData.createElementGetter(actualKeyType); + ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(actualValueType); + + for (Map.Entry entry : expected.entrySet()) { + Object matchedActualKey = null; + int matchedKeyIndex = 0; + for (int i = 0; i < actual.size(); i += 1) { + try { + Object key = keyGetter.getElementOrNull(actualKeyArrayData, i); + assertEquals(keyType, actualKeyType, entry.getKey(), key); + matchedActualKey = key; + matchedKeyIndex = i; + break; + } catch (AssertionError e) { + // not found + } + } + assertThat(matchedActualKey).as("Should have a matching key").isNotNull(); + final int valueIndex = matchedKeyIndex; + assertEquals( + valueType, + actualValueType, + entry.getValue(), + valueGetter.getElementOrNull(actualValueArrayData, valueIndex)); + } + } + + public static void assertEquals(ManifestFile expected, ManifestFile actual) { + if (expected == actual) { + return; + } + assertThat(expected).isNotNull(); + assertThat(actual).isNotNull(); + assertThat(actual.path()).as("Path must match").isEqualTo(expected.path()); + assertThat(actual.length()).as("Length must match").isEqualTo(expected.length()); + assertThat(actual.partitionSpecId()) + .as("Spec id must match") + .isEqualTo(expected.partitionSpecId()); + assertThat(actual.content()).as("ManifestContent must match").isEqualTo(expected.content()); + assertThat(actual.sequenceNumber()) + .as("SequenceNumber must match") + .isEqualTo(expected.sequenceNumber()); + assertThat(actual.minSequenceNumber()) + .as("MinSequenceNumber must match") + .isEqualTo(expected.minSequenceNumber()); + assertThat(actual.snapshotId()).as("Snapshot id must match").isEqualTo(expected.snapshotId()); + assertThat(actual.hasAddedFiles()) + .as("Added files flag must match") + .isEqualTo(expected.hasAddedFiles()); + assertThat(actual.addedFilesCount()) + .as("Added files count must match") + .isEqualTo(expected.addedFilesCount()); + assertThat(actual.addedRowsCount()) + .as("Added rows count must match") + .isEqualTo(expected.addedRowsCount()); + assertThat(actual.hasExistingFiles()) + .as("Existing files flag must match") + .isEqualTo(expected.hasExistingFiles()); + assertThat(actual.existingFilesCount()) + .as("Existing files count must match") + .isEqualTo(expected.existingFilesCount()); + assertThat(actual.existingRowsCount()) + .as("Existing rows count must match") + .isEqualTo(expected.existingRowsCount()); + assertThat(actual.hasDeletedFiles()) + 
.as("Deleted files flag must match") + .isEqualTo(expected.hasDeletedFiles()); + assertThat(actual.deletedFilesCount()) + .as("Deleted files count must match") + .isEqualTo(expected.deletedFilesCount()); + assertThat(actual.deletedRowsCount()) + .as("Deleted rows count must match") + .isEqualTo(expected.deletedRowsCount()); + + List expectedSummaries = expected.partitions(); + List actualSummaries = actual.partitions(); + assertThat(actualSummaries) + .as("PartitionFieldSummary size does not match") + .hasSameSizeAs(expectedSummaries); + for (int i = 0; i < expectedSummaries.size(); i++) { + assertThat(actualSummaries.get(i).containsNull()) + .as("Null flag in partition must match") + .isEqualTo(expectedSummaries.get(i).containsNull()); + assertThat(actualSummaries.get(i).containsNaN()) + .as("NaN flag in partition must match") + .isEqualTo(expectedSummaries.get(i).containsNaN()); + assertThat(actualSummaries.get(i).lowerBound()) + .as("Lower bounds in partition must match") + .isEqualTo(expectedSummaries.get(i).lowerBound()); + assertThat(actualSummaries.get(i).upperBound()) + .as("Upper bounds in partition must match") + .isEqualTo(expectedSummaries.get(i).upperBound()); + } + } + + public static void assertEquals(ContentFile expected, ContentFile actual) { + if (expected == actual) { + return; + } + assertThat(expected).isNotNull(); + assertThat(actual).isNotNull(); + assertThat(actual.specId()).as("SpecId").isEqualTo(expected.specId()); + assertThat(actual.content()).as("Content").isEqualTo(expected.content()); + assertThat(actual.location()).as("Location").isEqualTo(expected.location()); + assertThat(actual.format()).as("Format").isEqualTo(expected.format()); + assertThat(actual.partition().size()) + .as("Partition size") + .isEqualTo(expected.partition().size()); + for (int i = 0; i < expected.partition().size(); i++) { + assertThat(actual.partition().get(i, Object.class)) + .as("Partition data at index " + i) + .isEqualTo(expected.partition().get(i, Object.class)); + } + assertThat(actual.recordCount()).as("Record count").isEqualTo(expected.recordCount()); + assertThat(actual.fileSizeInBytes()) + .as("File size in bytes") + .isEqualTo(expected.fileSizeInBytes()); + assertThat(actual.columnSizes()).as("Column sizes").isEqualTo(expected.columnSizes()); + assertThat(actual.valueCounts()).as("Value counts").isEqualTo(expected.valueCounts()); + assertThat(actual.nullValueCounts()) + .as("Null value counts") + .isEqualTo(expected.nullValueCounts()); + assertThat(actual.lowerBounds()).as("Lower bounds").isEqualTo(expected.lowerBounds()); + assertThat(actual.upperBounds()).as("Upper bounds").isEqualTo(expected.upperBounds()); + assertThat(actual.keyMetadata()).as("Key metadata").isEqualTo(expected.keyMetadata()); + assertThat(actual.splitOffsets()).as("Split offsets").isEqualTo(expected.splitOffsets()); + assertThat(actual.equalityFieldIds()) + .as("Equality field id list") + .isEqualTo(expected.equalityFieldIds()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java new file mode 100644 index 000000000000..0a6d5e44caa2 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java @@ -0,0 +1,331 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.file.Files; +import java.util.Map; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.catalog.Catalog; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.types.Row; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.thrift.TException; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergConnector extends TestBase { + + private static final String TABLE_NAME = "test_table"; + + @Parameter(index = 0) + private String catalogName; + + @Parameter(index = 1) + private Map properties; + + @Parameter(index = 2) + private boolean isStreaming; + + private volatile TableEnvironment tEnv; + + @Parameters(name = "catalogName = {0}, properties = {1}, isStreaming = {2}") + public static Iterable parameters() { + return Lists.newArrayList( + // Create iceberg table in the hadoop catalog and default database. + new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop"), + true + }, + new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-table", "not_existing_table"), + true + }, + new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop"), + false + }, + // Create iceberg table in the hadoop catalog and not_existing_db. 
+ new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db"), + true + }, + new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db", + "catalog-table", "not_existing_table"), + true + }, + new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db"), + false + }, + // Create iceberg table in the hive catalog and default database. + new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive"), + true + }, + new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-table", "not_existing_table"), + true + }, + new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive"), + false + }, + // Create iceberg table in the hive catalog and not_existing_db. + new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db"), + true + }, + new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db", + "catalog-table", "not_existing_table"), + true + }, + new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db"), + false + }); + } + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); + if (isStreaming) { + settingsBuilder.inStreamingMode(); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(400); + env.setMaxParallelism(2); + env.setParallelism(2); + tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); + } else { + settingsBuilder.inBatchMode(); + tEnv = TableEnvironment.create(settingsBuilder.build()); + } + // Set only one parallelism. + tEnv.getConfig() + .getConfiguration() + .set(CoreOptions.DEFAULT_PARALLELISM, 1) + .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); + } + } + } + return tEnv; + } + + @AfterEach + public void after() throws TException { + sql("DROP TABLE IF EXISTS %s", TABLE_NAME); + + // Clean the created orphan databases and tables from hive-metastore. + if (isHiveCatalog()) { + HiveMetaStoreClient metaStoreClient = new HiveMetaStoreClient(hiveConf); + try { + metaStoreClient.dropTable(databaseName(), tableName()); + if (!isDefaultDatabaseName()) { + try { + metaStoreClient.dropDatabase(databaseName()); + } catch (Exception ignored) { + // Ignore + } + } + } finally { + metaStoreClient.close(); + } + } + } + + private void testCreateConnectorTable() { + Map tableProps = createTableProps(); + + // Create table under the flink's current database. 
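+    // createTableProps() supplies 'connector'='iceberg' plus the catalog type and warehouse
+    // location, so the rows written here land in the backing Iceberg catalog, which is
+    // verified below through FlinkCatalogFactory.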
+ sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); + sql("INSERT INTO %s VALUES (1, 'AAA'), (2, 'BBB'), (3, 'CCC')", TABLE_NAME); + assertThat(sql("SELECT * FROM %s", TABLE_NAME)) + .containsExactlyInAnyOrder(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")); + + FlinkCatalogFactory factory = new FlinkCatalogFactory(); + Catalog flinkCatalog = factory.createCatalog(catalogName, tableProps, new Configuration()); + assertThat(flinkCatalog.databaseExists(databaseName())).isTrue(); + assertThat(flinkCatalog.tableExists(new ObjectPath(databaseName(), tableName()))).isTrue(); + + // Drop and create it again. + sql("DROP TABLE %s", TABLE_NAME); + sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); + assertThat(sql("SELECT * FROM %s", TABLE_NAME)) + .containsExactlyInAnyOrder(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")); + } + + @TestTemplate + public void testCreateTableUnderDefaultDatabase() { + testCreateConnectorTable(); + } + + @TestTemplate + public void testCatalogDatabaseConflictWithFlinkDatabase() { + sql("CREATE DATABASE IF NOT EXISTS `%s`", databaseName()); + sql("USE `%s`", databaseName()); + testCreateConnectorTable(); + // Ensure that the table was created under the specific database. + assertThatThrownBy( + () -> sql("CREATE TABLE `default_catalog`.`%s`.`%s`", databaseName(), TABLE_NAME)) + .isInstanceOf(org.apache.flink.table.api.ValidationException.class) + .hasMessageStartingWith("Could not execute CreateTable in path"); + } + + @TestTemplate + public void testConnectorTableInIcebergCatalog() { + // Create the catalog properties + Map catalogProps = Maps.newHashMap(); + catalogProps.put("type", "iceberg"); + if (isHiveCatalog()) { + catalogProps.put("catalog-type", "hive"); + catalogProps.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); + } else { + catalogProps.put("catalog-type", "hadoop"); + } + catalogProps.put(CatalogProperties.WAREHOUSE_LOCATION, createWarehouse()); + + // Create the table properties + Map tableProps = createTableProps(); + + // Create a connector table in an iceberg catalog. 
+ sql("CREATE CATALOG `test_catalog` WITH %s", toWithClause(catalogProps)); + try { + assertThatThrownBy( + () -> + sql( + "CREATE TABLE `test_catalog`.`%s`.`%s` (id BIGINT, data STRING) WITH %s", + FlinkCatalogFactory.DEFAULT_DATABASE_NAME, + TABLE_NAME, + toWithClause(tableProps))) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessage( + "Cannot create the table with 'connector'='iceberg' table property in an iceberg catalog, " + + "Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " + + "create table without 'connector'='iceberg' related properties in an iceberg table."); + } finally { + sql("DROP CATALOG IF EXISTS `test_catalog`"); + } + } + + private Map createTableProps() { + Map tableProps = Maps.newHashMap(properties); + tableProps.put("catalog-name", catalogName); + tableProps.put(CatalogProperties.WAREHOUSE_LOCATION, createWarehouse()); + if (isHiveCatalog()) { + tableProps.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); + } + return tableProps; + } + + private boolean isHiveCatalog() { + return "testhive".equalsIgnoreCase(catalogName); + } + + private boolean isDefaultDatabaseName() { + return FlinkCatalogFactory.DEFAULT_DATABASE_NAME.equalsIgnoreCase(databaseName()); + } + + private String tableName() { + return properties.getOrDefault("catalog-table", TABLE_NAME); + } + + private String databaseName() { + return properties.getOrDefault("catalog-database", "default_database"); + } + + private String createWarehouse() { + try { + return String.format( + "file://%s", + Files.createTempDirectory(temporaryDirectory, "junit").toFile().getAbsolutePath()); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java new file mode 100644 index 000000000000..5e7935be01c4 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Path; +import org.apache.flink.api.common.serialization.SerializerConfigImpl; +import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.GenericManifestFile; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestWriter; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestManifestFileSerialization { + + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + required(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("double").build(); + + private static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartition(org.apache.iceberg.TestHelpers.Row.of(1D)) + .withPartitionPath("double=1") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + ImmutableMap.of(), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartition(org.apache.iceberg.TestHelpers.Row.of(Double.NaN)) + .withPartitionPath("double=NaN") + .withMetrics( + new Metrics( + 1L, + null, // no column sizes + ImmutableMap.of(1, 1L, 4, 1L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + ImmutableMap.of(4, 1L), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(1L)) // upper bounds + )) + .build(); + + private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); + + @TempDir private Path temp; + + @Test + public void testKryoSerialization() throws IOException { + KryoSerializer kryo = + new KryoSerializer<>(ManifestFile.class, new SerializerConfigImpl()); + + DataOutputSerializer outputView = new DataOutputSerializer(1024); + + ManifestFile manifest = writeManifest(FILE_A, FILE_B); + + kryo.serialize(manifest, outputView); + kryo.serialize(manifest.copy(), outputView); + 
kryo.serialize(GenericManifestFile.copyOf(manifest).build(), outputView); + + DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); + ManifestFile m1 = kryo.deserialize(inputView); + ManifestFile m2 = kryo.deserialize(inputView); + ManifestFile m3 = kryo.deserialize(inputView); + + TestHelpers.assertEquals(manifest, m1); + TestHelpers.assertEquals(manifest, m2); + TestHelpers.assertEquals(manifest, m3); + } + + @Test + public void testJavaSerialization() throws Exception { + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + + ManifestFile manifest = writeManifest(FILE_A, FILE_B); + + try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { + out.writeObject(manifest); + out.writeObject(manifest.copy()); + out.writeObject(GenericManifestFile.copyOf(manifest).build()); + } + + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + for (int i = 0; i < 3; i += 1) { + Object obj = in.readObject(); + assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); + TestHelpers.assertEquals(manifest, (ManifestFile) obj); + } + } + } + + private ManifestFile writeManifest(DataFile... files) throws IOException { + File manifestFile = File.createTempFile("input", "m0.avro", temp.toFile()); + assertThat(manifestFile.delete()).isTrue(); + OutputFile outputFile = FILE_IO.newOutputFile(manifestFile.getCanonicalPath()); + + ManifestWriter writer = ManifestFiles.write(SPEC, outputFile); + try { + for (DataFile file : files) { + writer.add(file); + } + } finally { + writer.close(); + } + + return writer.toManifestFile(); + } + + private static ByteBuffer longToBuffer(long value) { + return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java new file mode 100644 index 000000000000..0e7635a33e87 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Iterator; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.RecordWrapperTestBase; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.data.InternalRecordWrapper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.data.RandomRowData; +import org.apache.iceberg.util.StructLikeWrapper; + +public class TestRowDataWrapper extends RecordWrapperTestBase { + + /** + * Flink's time type has been truncated to millis seconds, so we need a customized assert method + * to check the values. + */ + @Override + public void testTime() { + generateAndValidate( + new Schema(TIME.fields()), + (message, expectedWrapper, actualWrapper) -> { + for (int pos = 0; pos < TIME.fields().size(); pos++) { + Object expected = expectedWrapper.get().get(pos, Object.class); + Object actual = actualWrapper.get().get(pos, Object.class); + if (expected == actual) { + return; + } + + assertThat(actual).isNotNull(); + assertThat(expected).isNotNull(); + + int expectedMilliseconds = (int) ((long) expected / 1000_000); + int actualMilliseconds = (int) ((long) actual / 1000_000); + assertThat(actualMilliseconds).as(message).isEqualTo(expectedMilliseconds); + } + }); + } + + @Override + protected void generateAndValidate( + Schema schema, RecordWrapperTestBase.AssertMethod assertMethod) { + int numRecords = 100; + Iterable recordList = RandomGenericData.generate(schema, numRecords, 101L); + Iterable rowDataList = RandomRowData.generate(schema, numRecords, 101L); + + InternalRecordWrapper recordWrapper = new InternalRecordWrapper(schema.asStruct()); + RowDataWrapper rowDataWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + + Iterator actual = recordList.iterator(); + Iterator expected = rowDataList.iterator(); + + StructLikeWrapper actualWrapper = StructLikeWrapper.forType(schema.asStruct()); + StructLikeWrapper expectedWrapper = StructLikeWrapper.forType(schema.asStruct()); + for (int i = 0; i < numRecords; i++) { + assertThat(actual).hasNext(); + assertThat(expected).hasNext(); + + StructLike recordStructLike = recordWrapper.wrap(actual.next()); + StructLike rowDataStructLike = rowDataWrapper.wrap(expected.next()); + + assertMethod.assertEquals( + "Should have expected StructLike values", + expectedWrapper.set(rowDataStructLike), + actualWrapper.set(recordStructLike)); + } + + assertThat(actual).isExhausted(); + assertThat(expected).isExhausted(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java new file mode 100644 index 000000000000..a7c58e551112 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.io.File; +import org.apache.iceberg.Table; +import org.apache.iceberg.TestTables; + +public class TestTableLoader implements TableLoader { + private final File dir; + + public static TableLoader of(String dir) { + return new TestTableLoader(dir); + } + + public TestTableLoader(String dir) { + this.dir = new File(dir); + } + + @Override + public void open() {} + + @Override + public boolean isOpen() { + return true; + } + + @Override + public Table loadTable() { + return TestTables.load(dir, "test"); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public TableLoader clone() { + return new TestTableLoader(dir.getAbsolutePath()); + } + + @Override + public void close() {} +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java new file mode 100644 index 000000000000..7f0e7acaa822 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.flink.TestHelpers.roundTripKryoSerialize; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.Map; +import org.apache.iceberg.HasTableOperations; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.MetadataTableUtils; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableOperations; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.Transaction; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestTableSerialization { + private static final HadoopTables TABLES = new HadoopTables(); + + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); + + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); + + @TempDir private Path temp; + private Table table; + + @BeforeEach + public void initTable() throws IOException { + Map props = ImmutableMap.of("k1", "v1", "k2", "v2"); + + File tableLocation = File.createTempFile("junit", null, temp.toFile()); + assertThat(tableLocation.delete()).isTrue(); + + this.table = TABLES.create(SCHEMA, SPEC, SORT_ORDER, props, tableLocation.toString()); + } + + @Test + public void testSerializableTableKryoSerialization() throws IOException { + SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); + TestHelpers.assertSerializedAndLoadedMetadata( + table, roundTripKryoSerialize(SerializableTable.class, serializableTable)); + } + + @Test + public void testSerializableMetadataTableKryoSerialization() throws IOException { + for (MetadataTableType type : MetadataTableType.values()) { + TableOperations ops = ((HasTableOperations) table).operations(); + Table metadataTable = + MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); + SerializableTable serializableMetadataTable = + (SerializableTable) SerializableTable.copyOf(metadataTable); + + TestHelpers.assertSerializedAndLoadedMetadata( + metadataTable, + roundTripKryoSerialize(SerializableTable.class, serializableMetadataTable)); + } + } + + @Test + public void testSerializableTransactionTableKryoSerialization() throws IOException { + Transaction txn = table.newTransaction(); + + txn.updateProperties().set("k1", "v1").commit(); + + Table txnTable = txn.table(); + SerializableTable serializableTxnTable = (SerializableTable) SerializableTable.copyOf(txnTable); + + TestHelpers.assertSerializedMetadata( + txnTable, roundTripKryoSerialize(SerializableTable.class, serializableTxnTable)); + } +} diff --git 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java new file mode 100644 index 000000000000..b9c8ebbf179b --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java @@ -0,0 +1,523 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.actions; + +import static org.apache.iceberg.flink.SimpleDataUtil.RECORD; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.Collection; +import java.util.List; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.types.Row; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Files; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.actions.RewriteDataFilesActionResult; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.CatalogTestBase; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.Pair; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.io.TempDir; + +public class 
TestRewriteDataFilesAction extends CatalogTestBase { + + private static final String TABLE_NAME_UNPARTITIONED = "test_table_unpartitioned"; + private static final String TABLE_NAME_PARTITIONED = "test_table_partitioned"; + private static final String TABLE_NAME_WITH_PK = "test_table_with_pk"; + + @Parameter(index = 2) + private FileFormat format; + + @Parameter(index = 3) + private int formatVersion; + + private Table icebergTableUnPartitioned; + private Table icebergTablePartitioned; + private Table icebergTableWithPk; + + @Override + protected TableEnvironment getTableEnv() { + super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); + return super.getTableEnv(); + } + + @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, formatVersion={3}") + public static List parameters() { + List parameters = Lists.newArrayList(); + for (FileFormat format : + new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}) { + for (Object[] catalogParams : CatalogTestBase.parameters()) { + for (int version : TestHelpers.V2_AND_ABOVE) { + String catalogName = (String) catalogParams[0]; + Namespace baseNamespace = (Namespace) catalogParams[1]; + parameters.add(new Object[] {catalogName, baseNamespace, format, version}); + } + } + } + return parameters; + } + + private @TempDir Path temp; + + @Override + @BeforeEach + public void before() { + super.before(); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s', '%s'='%s')", + TABLE_NAME_UNPARTITIONED, format.name(), TableProperties.FORMAT_VERSION, formatVersion); + icebergTableUnPartitioned = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_UNPARTITIONED)); + + sql( + "CREATE TABLE %s (id int, data varchar,spec varchar) " + + " PARTITIONED BY (data,spec) with ('write.format.default'='%s', '%s'='%s')", + TABLE_NAME_PARTITIONED, format.name(), TableProperties.FORMAT_VERSION, formatVersion); + icebergTablePartitioned = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_PARTITIONED)); + + sql( + "CREATE TABLE %s (id int, data varchar, PRIMARY KEY(`id`) NOT ENFORCED) with ('write.format.default'='%s', '%s'='%s')", + TABLE_NAME_WITH_PK, format.name(), TableProperties.FORMAT_VERSION, formatVersion); + icebergTableWithPk = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); + } + + @Override + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_UNPARTITIONED); + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_PARTITIONED); + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_WITH_PK); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + @TestTemplate + public void testFailureOnV3Table() { + // Flink does not support compaction on row lineage enabled tables (V3+) + assumeThat(formatVersion).isGreaterThanOrEqualTo(3); + + assertThatThrownBy( + () -> Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute()) + .hasMessageContaining( + "Flink does not support compaction on row lineage enabled tables (V3+)") + .isInstanceOf(IllegalArgumentException.class); + } + + @TestTemplate + public void testRewriteDataFilesEmptyTable() throws Exception { + // Flink does not support compaction on row lineage enabled tables (V3+) + assumeThat(formatVersion).isLessThan(3); + + 
assertThat(icebergTableUnPartitioned.currentSnapshot()).isNull(); + Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); + assertThat(icebergTableUnPartitioned.currentSnapshot()).isNull(); + } + + @TestTemplate + public void testRewriteDataFilesUnpartitionedTable() throws Exception { + // Flink does not support compaction on row lineage enabled tables (V3+) + assumeThat(formatVersion).isLessThan(3); + + sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_UNPARTITIONED); + sql("INSERT INTO %s SELECT 2, 'world'", TABLE_NAME_UNPARTITIONED); + + icebergTableUnPartitioned.refresh(); + + CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + assertThat(dataFiles).hasSize(2); + RewriteDataFilesActionResult result = + Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); + + assertThat(result.deletedDataFiles()).hasSize(2); + assertThat(result.addedDataFiles()).hasSize(1); + + icebergTableUnPartitioned.refresh(); + + CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + assertThat(dataFiles1).hasSize(1); + // Assert the table records as expected. + SimpleDataUtil.assertTableRecords( + icebergTableUnPartitioned, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), SimpleDataUtil.createRecord(2, "world"))); + } + + @TestTemplate + public void testRewriteDataFilesPartitionedTable() throws Exception { + // Flink does not support compaction on row lineage enabled tables (V3+) + assumeThat(formatVersion).isLessThan(3); + + sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 3, 'world' ,'b'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARTITIONED); + + icebergTablePartitioned.refresh(); + + CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + assertThat(dataFiles).hasSize(4); + RewriteDataFilesActionResult result = + Actions.forTable(icebergTablePartitioned).rewriteDataFiles().execute(); + + assertThat(result.deletedDataFiles()).hasSize(4); + assertThat(result.addedDataFiles()).hasSize(2); + + icebergTablePartitioned.refresh(); + + CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + assertThat(dataFiles1).hasSize(2); + // Assert the table records as expected. 
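+ // Build a schema that includes the partition columns so the expected records can carry 'data' and 'spec' values.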
+ Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "spec", Types.StringType.get())); + + Record record = GenericRecord.create(schema); + SimpleDataUtil.assertTableRecords( + icebergTablePartitioned, + Lists.newArrayList( + record.copy("id", 1, "data", "hello", "spec", "a"), + record.copy("id", 2, "data", "hello", "spec", "a"), + record.copy("id", 3, "data", "world", "spec", "b"), + record.copy("id", 4, "data", "world", "spec", "b"))); + } + + @TestTemplate + public void testRewriteDataFilesWithFilter() throws Exception { + // Flink does not support compaction on row lineage enabled tables (V3+) + assumeThat(formatVersion).isLessThan(3); + + sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 3, 'world' ,'a'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 5, 'world' ,'b'", TABLE_NAME_PARTITIONED); + + icebergTablePartitioned.refresh(); + + CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + assertThat(dataFiles).hasSize(5); + RewriteDataFilesActionResult result = + Actions.forTable(icebergTablePartitioned) + .rewriteDataFiles() + .filter(Expressions.equal("spec", "a")) + .filter(Expressions.startsWith("data", "he")) + .execute(); + assertThat(result.deletedDataFiles()).hasSize(2); + assertThat(result.addedDataFiles()).hasSize(1); + + icebergTablePartitioned.refresh(); + + CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + assertThat(dataFiles1).hasSize(4); + // Assert the table records as expected. 
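+ // Only the two files in partition (data='hello', spec='a') were compacted; all five rows should still be present.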
+ Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "spec", Types.StringType.get())); + + Record record = GenericRecord.create(schema); + SimpleDataUtil.assertTableRecords( + icebergTablePartitioned, + Lists.newArrayList( + record.copy("id", 1, "data", "hello", "spec", "a"), + record.copy("id", 2, "data", "hello", "spec", "a"), + record.copy("id", 3, "data", "world", "spec", "a"), + record.copy("id", 4, "data", "world", "spec", "b"), + record.copy("id", 5, "data", "world", "spec", "b"))); + } + + @TestTemplate + public void testRewriteLargeTableHasResiduals() throws IOException { + // Flink does not support compaction on row lineage enabled tables (V3+) + assumeThat(formatVersion).isLessThan(3); + + // all records belong to the same partition + List records1 = Lists.newArrayList(); + List records2 = Lists.newArrayList(); + List expected = Lists.newArrayList(); + for (int i = 0; i < 100; i++) { + int id = i; + String data = String.valueOf(i % 3); + if (i % 2 == 0) { + records1.add("(" + id + ",'" + data + "')"); + } else { + records2.add("(" + id + ",'" + data + "')"); + } + Record record = RECORD.copy(); + record.setField("id", id); + record.setField("data", data); + expected.add(record); + } + + sql("INSERT INTO %s values " + StringUtils.join(records1, ","), TABLE_NAME_UNPARTITIONED); + sql("INSERT INTO %s values " + StringUtils.join(records2, ","), TABLE_NAME_UNPARTITIONED); + + icebergTableUnPartitioned.refresh(); + + CloseableIterable tasks = + icebergTableUnPartitioned + .newScan() + .ignoreResiduals() + .filter(Expressions.equal("data", "0")) + .planFiles(); + for (FileScanTask task : tasks) { + assertThat(task.residual()) + .as("Residuals must be ignored") + .isEqualTo(Expressions.alwaysTrue()); + } + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + assertThat(dataFiles).hasSize(2); + Actions actions = Actions.forTable(icebergTableUnPartitioned); + + RewriteDataFilesActionResult result = + actions.rewriteDataFiles().filter(Expressions.equal("data", "0")).execute(); + assertThat(result.deletedDataFiles()).hasSize(2); + assertThat(result.addedDataFiles()).hasSize(1); + // Assert the table records as expected. + SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected); + } + + /** + * a test case to test avoid repeate compress + * + *

<p>If a data file cannot be combined into a CombinedScanTask with other data files, the + * resulting CombinedScanTask list has size 1, so we remove these CombinedScanTasks to avoid + * compressing them repeatedly. + * + * <p>
    In this test case,we generated 3 data files and set targetSizeInBytes greater than the + * largest file size so that it cannot be combined a CombinedScanTask with other datafiles. The + * datafile with the largest file size will not be compressed. + * + * @throws IOException IOException + */ + @TestTemplate + public void testRewriteAvoidRepeateCompress() throws IOException { + // Flink does not support compaction on row lineage enabled tables (V3+) + assumeThat(formatVersion).isLessThan(3); + + List expected = Lists.newArrayList(); + Schema schema = icebergTableUnPartitioned.schema(); + GenericAppenderFactory genericAppenderFactory = new GenericAppenderFactory(schema); + File file = File.createTempFile("junit", null, temp.toFile()); + int count = 0; + try (FileAppender fileAppender = + genericAppenderFactory.newAppender(Files.localOutput(file), format)) { + long filesize = 20000; + for (; fileAppender.length() < filesize; count++) { + Record record = SimpleDataUtil.createRecord(count, UUID.randomUUID().toString()); + fileAppender.add(record); + expected.add(record); + } + } + + DataFile dataFile = + DataFiles.builder(icebergTableUnPartitioned.spec()) + .withPath(file.getAbsolutePath()) + .withFileSizeInBytes(file.length()) + .withFormat(format) + .withRecordCount(count) + .build(); + + icebergTableUnPartitioned.newAppend().appendFile(dataFile).commit(); + + sql("INSERT INTO %s SELECT 1,'a' ", TABLE_NAME_UNPARTITIONED); + sql("INSERT INTO %s SELECT 2,'b' ", TABLE_NAME_UNPARTITIONED); + + icebergTableUnPartitioned.refresh(); + + CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + assertThat(dataFiles).hasSize(3); + Actions actions = Actions.forTable(icebergTableUnPartitioned); + + long targetSizeInBytes = file.length() + 10; + RewriteDataFilesActionResult result = + actions + .rewriteDataFiles() + .targetSizeInBytes(targetSizeInBytes) + .splitOpenFileCost(1) + .execute(); + assertThat(result.deletedDataFiles()).hasSize(2); + assertThat(result.addedDataFiles()).hasSize(1); + icebergTableUnPartitioned.refresh(); + + CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); + List dataFilesRewrote = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + assertThat(dataFilesRewrote).hasSize(2); + // the biggest file do not be rewrote + List rewroteDataFileNames = + dataFilesRewrote.stream().map(ContentFile::location).collect(Collectors.toList()); + assertThat(rewroteDataFileNames).contains(file.getAbsolutePath()); + + // Assert the table records as expected. 
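+ // The two rows inserted via SQL are appended to the expected records before validating the table contents.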
+ expected.add(SimpleDataUtil.createRecord(1, "a")); + expected.add(SimpleDataUtil.createRecord(2, "b")); + SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected); + } + + @TestTemplate + public void testRewriteNoConflictWithEqualityDeletes() throws IOException { + // Flink does not support compaction on row lineage enabled tables (V3+) + assumeThat(formatVersion).isLessThan(3); + + // Add 2 data files + sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_WITH_PK); + sql("INSERT INTO %s SELECT 2, 'world'", TABLE_NAME_WITH_PK); + + // Load 2 stale tables to pass to rewrite actions + // Since the first rewrite will refresh stale1, we need another stale2 for the second rewrite + Table stale1 = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); + Table stale2 = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); + + // Add 1 data file and 1 equality-delete file + sql("INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ SELECT 1, 'hi'", TABLE_NAME_WITH_PK); + + icebergTableWithPk.refresh(); + assertThat(icebergTableWithPk.currentSnapshot().sequenceNumber()) + .as("The latest sequence number should be greater than that of the stale snapshot") + .isEqualTo(stale1.currentSnapshot().sequenceNumber() + 1); + CloseableIterable tasks = icebergTableWithPk.newScan().planFiles(); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + Set deleteFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::deletes)).stream() + .flatMap(Collection::stream) + .collect(Collectors.toSet()); + assertThat(dataFiles).hasSize(3); + assertThat(deleteFiles).hasSize(1); + assertThat(Iterables.getOnlyElement(deleteFiles).content()) + .isEqualTo(FileContent.EQUALITY_DELETES); + shouldHaveDataAndFileSequenceNumbers( + TABLE_NAME_WITH_PK, + ImmutableList.of(Pair.of(1L, 1L), Pair.of(2L, 2L), Pair.of(3L, 3L), Pair.of(3L, 3L))); + + assertThatThrownBy( + () -> + Actions.forTable(stale1) + .rewriteDataFiles() + .useStartingSequenceNumber(false) + .execute(), + "Rewrite using new sequence number should fail") + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Cannot commit, found new delete for replaced data file"); + + // Rewrite using the starting sequence number should succeed + RewriteDataFilesActionResult result = + Actions.forTable(stale2).rewriteDataFiles().useStartingSequenceNumber(true).execute(); + + // Should not rewrite files from the new commit + assertThat(result.deletedDataFiles()).hasSize(2); + assertThat(result.addedDataFiles()).hasSize(1); + // The 2 older files with file-sequence-number <= 2 should be rewritten into a new file. + // The new file is the one with file-sequence-number == 4. + // The new file should use rewrite's starting-sequence-number 2 as its data-sequence-number. + shouldHaveDataAndFileSequenceNumbers( + TABLE_NAME_WITH_PK, ImmutableList.of(Pair.of(3L, 3L), Pair.of(3L, 3L), Pair.of(2L, 4L))); + + // Assert the table records as expected. + SimpleDataUtil.assertTableRecords( + icebergTableWithPk, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hi"), SimpleDataUtil.createRecord(2, "world"))); + } + + /** + * Assert that data files and delete files in the table should have expected data sequence numbers + * and file sequence numbers + * + * @param tableName table name + * @param expectedSequenceNumbers list of {@link Pair}'s. 
Each {@link Pair} contains + * (expectedDataSequenceNumber, expectedFileSequenceNumber) of a file. + */ + private void shouldHaveDataAndFileSequenceNumbers( + String tableName, List> expectedSequenceNumbers) { + // "status < 2" for added or existing entries + List liveEntries = sql("SELECT * FROM %s$entries WHERE status < 2", tableName); + + List> actualSequenceNumbers = + liveEntries.stream() + .map( + row -> + Pair.of( + row.getFieldAs("sequence_number"), row.getFieldAs("file_sequence_number"))) + .collect(Collectors.toList()); + assertThat(actualSequenceNumbers).hasSameElementsAs(expectedSequenceNumbers); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java new file mode 100644 index 000000000000..cc58d9817ac6 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.RowDataConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; + +public class RandomRowData { + private RandomRowData() {} + + public static Iterable generate(Schema schema, int numRecords, long seed) { + return convert(schema, RandomGenericData.generate(schema, numRecords, seed)); + } + + public static Iterable convert(Schema schema, Iterable records) { + return Iterables.transform(records, record -> RowDataConverter.convert(schema, record)); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java new file mode 100644 index 000000000000..5ffd31ad2010 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.conversion.DataStructureConverter; +import org.apache.flink.table.data.conversion.DataStructureConverters; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.Row; + +public class RowDataToRowMapper extends RichMapFunction { + + private final RowType rowType; + + private transient DataStructureConverter converter; + + public RowDataToRowMapper(RowType rowType) { + this.rowType = rowType; + } + + @Override + public void open(OpenContext parameters) throws Exception { + this.converter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); + } + + @Override + public Row map(RowData value) throws Exception { + return (Row) converter.toExternal(value); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java new file mode 100644 index 000000000000..45b679eeda73 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.data.DataTestBase; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.avro.DataWriter; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.inmemory.InMemoryOutputFile; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +public class TestFlinkAvroReaderWriter extends DataTestBase { + + private static final int NUM_RECORDS = 100; + + @Override + protected boolean supportsDefaultValues() { + return true; + } + + @Override + protected boolean supportsUnknown() { + return true; + } + + @Override + protected boolean supportsTimestampNanos() { + return true; + } + + @Override + protected void writeAndValidate(Schema schema) throws IOException { + List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1991L); + writeAndValidate(schema, expectedRecords); + } + + @Override + protected void writeAndValidate(Schema schema, List expectedRecords) throws IOException { + writeAndValidate(schema, schema, expectedRecords); + } + + @Override + protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throws IOException { + List expectedRecords = RandomGenericData.generate(writeSchema, NUM_RECORDS, 1991L); + writeAndValidate(writeSchema, expectedSchema, expectedRecords); + } + + protected void writeAndValidate( + Schema writeSchema, Schema expectedSchema, List expectedRecords) throws IOException { + List expectedRows = + Lists.newArrayList(RandomRowData.convert(writeSchema, expectedRecords)); + + OutputFile outputFile = new InMemoryOutputFile(); + + // Write the expected records into AVRO file, then read them into RowData and assert with the + // expected Record list. + try (FileAppender writer = + Avro.write(outputFile).schema(writeSchema).createWriterFunc(DataWriter::create).build()) { + writer.addAll(expectedRecords); + } + + RowType flinkSchema = FlinkSchemaUtil.convert(expectedSchema); + + try (CloseableIterable reader = + Avro.read(outputFile.toInputFile()) + .project(expectedSchema) + .createResolvingReader(FlinkPlannedAvroReader::create) + .build()) { + Iterator expected = expectedRecords.iterator(); + Iterator rows = reader.iterator(); + for (int i = 0; i < expectedRecords.size(); i++) { + assertThat(rows).hasNext(); + TestHelpers.assertRowData( + expectedSchema.asStruct(), flinkSchema, expected.next(), rows.next()); + } + assertThat(rows).isExhausted(); + } + + OutputFile file = new InMemoryOutputFile(); + + // Write the expected RowData into AVRO file, then read them into Record and assert with the + // expected RowData list. 
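+ // The reverse direction: FlinkAvroWriter writes the RowData rows, which are then read back with FlinkPlannedAvroReader and compared against the generic records.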
+ try (FileAppender writer = + Avro.write(file) + .schema(writeSchema) + .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) + .build()) { + writer.addAll(expectedRows); + } + + try (CloseableIterable reader = + Avro.read(file.toInputFile()) + .project(expectedSchema) + .createResolvingReader(FlinkPlannedAvroReader::create) + .build()) { + Iterator expected = expectedRecords.iterator(); + Iterator rows = reader.iterator(); + for (int i = 0; i < expectedRecords.size(); i += 1) { + assertThat(rows).hasNext(); + TestHelpers.assertRowData( + expectedSchema.asStruct(), flinkSchema, expected.next(), rows.next()); + } + assertThat(rows).isExhausted(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java new file mode 100644 index 000000000000..4a70802f2a2e --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.DataTestBase; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.orc.GenericOrcReader; +import org.apache.iceberg.data.orc.GenericOrcWriter; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +public class TestFlinkOrcReaderWriter extends DataTestBase { + private static final int NUM_RECORDS = 100; + + /** Orc writers don't have notion of non-null / required fields. 
*/ + @Override + protected boolean allowsWritingNullValuesForRequiredFields() { + return true; + } + + @Override + protected void writeAndValidate(Schema schema) throws IOException { + List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L); + writeAndValidate(schema, expectedRecords); + } + + @Override + protected void writeAndValidate(Schema schema, List expectedRecords) throws IOException { + RowType flinkSchema = FlinkSchemaUtil.convert(schema); + List expectedRows = Lists.newArrayList(RandomRowData.convert(schema, expectedRecords)); + + File recordsFile = File.createTempFile("junit", null, temp.toFile()); + assertThat(recordsFile.delete()).isTrue(); + + // Write the expected records into ORC file, then read them into RowData and assert with the + // expected Record list. + try (FileAppender writer = + ORC.write(Files.localOutput(recordsFile)) + .schema(schema) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { + writer.addAll(expectedRecords); + } + + try (CloseableIterable reader = + ORC.read(Files.localInput(recordsFile)) + .project(schema) + .createReaderFunc(type -> new FlinkOrcReader(schema, type)) + .build()) { + Iterator expected = expectedRecords.iterator(); + Iterator rows = reader.iterator(); + for (int i = 0; i < expectedRecords.size(); i++) { + assertThat(rows).hasNext(); + TestHelpers.assertRowData(schema.asStruct(), flinkSchema, expected.next(), rows.next()); + } + assertThat(rows).isExhausted(); + } + + File rowDataFile = File.createTempFile("junit", null, temp.toFile()); + assertThat(rowDataFile.delete()).isTrue(); + + // Write the expected RowData into ORC file, then read them into Record and assert with the + // expected RowData list. + RowType rowType = FlinkSchemaUtil.convert(schema); + try (FileAppender writer = + ORC.write(Files.localOutput(rowDataFile)) + .schema(schema) + .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema)) + .build()) { + writer.addAll(expectedRows); + } + + try (CloseableIterable reader = + ORC.read(Files.localInput(rowDataFile)) + .project(schema) + .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type)) + .build()) { + Iterator expected = expectedRows.iterator(); + Iterator records = reader.iterator(); + for (int i = 0; i < expectedRecords.size(); i += 1) { + assertThat(records.hasNext()).isTrue(); + TestHelpers.assertRowData(schema.asStruct(), flinkSchema, records.next(), expected.next()); + } + assertThat(records).isExhausted(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java new file mode 100644 index 000000000000..e6781356f711 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.parquet.schema.Types.primitive; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Iterator; +import java.util.List; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.GenericRecordBuilder; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.data.DataTestBase; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.parquet.ParquetValueReader; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.junit.jupiter.api.Test; + +public class TestFlinkParquetReader extends DataTestBase { + private static final int NUM_RECORDS = 100; + + @Override + protected boolean supportsDefaultValues() { + return true; + } + + @Override + protected boolean supportsUnknown() { + return true; + } + + @Override + protected boolean supportsTimestampNanos() { + return true; + } + + @Test + public void testBuildReader() { + MessageType fileSchema = + new MessageType( + "test", + // 0: required(100, "id", LongType.get()) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .id(100) + .named("id"), + // 1: optional(101, "data", Types.StringType.get()) + primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.OPTIONAL) + .id(101) + .named("data"), + // 2: required(102, "b", Types.BooleanType.get()) + primitive(PrimitiveType.PrimitiveTypeName.BOOLEAN, Type.Repetition.REQUIRED) + .id(102) + .named("b"), + // 3: optional(103, "i", Types.IntegerType.get()) + primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL) + .id(103) + .named("i"), + // 4: optional(105, "f", Types.FloatType.get()) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .id(104) + .named("l"), + // 5: required(106, "d", Types.DoubleType.get()) + primitive(PrimitiveType.PrimitiveTypeName.FLOAT, Type.Repetition.OPTIONAL) + .id(105) + .named("f"), + // 6: required(106, "d", Types.DoubleType.get()) + 
primitive(PrimitiveType.PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(106) + .named("d"), + // 7: optional(107, "date", Types.DateType.get()) + primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL) + .id(107) + .as(LogicalTypeAnnotation.dateType()) + .named("date"), + // 8: required(108, "ts_tz", Types.TimestampType.withZone()) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .id(108) + .as( + LogicalTypeAnnotation.timestampType( + true, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("ts_tz"), + // 9: required(109, "ts", Types.TimestampType.withoutZone()) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .id(109) + .as( + LogicalTypeAnnotation.timestampType( + false, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("ts"), + // 10: required(110, "s", Types.StringType.get()) + primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .id(110) + .as(LogicalTypeAnnotation.stringType()) + .named("s"), + // 11: required(112, "fixed", Types.FixedType.ofLength(7)) + primitive( + PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, Type.Repetition.REQUIRED) + .id(112) + .length(7) + .named("f"), + // 12: optional(113, "bytes", Types.BinaryType.get()) + primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.OPTIONAL) + .id(113) + .named("bytes"), + // 13: required(114, "dec_9_0", Types.DecimalType.of(9, 0)) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .id(114) + .as(LogicalTypeAnnotation.decimalType(0, 9)) + .named("dec_9_0"), + // 14: required(115, "dec_11_2", Types.DecimalType.of(11, 2)) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .id(115) + .as(LogicalTypeAnnotation.decimalType(2, 11)) + .named("dec_11_2"), + // 15: required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision + primitive( + PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, Type.Repetition.REQUIRED) + .id(116) + .length(16) + .as(LogicalTypeAnnotation.decimalType(10, 38)) + .named("dec_38_10"), + // 16: required(117, "time", Types.TimeType.get()) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.OPTIONAL) + .id(117) + .as(LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("time")); + ParquetValueReader reader = + FlinkParquetReaders.buildReader(new Schema(SUPPORTED_PRIMITIVES.fields()), fileSchema); + + assertThat(reader.columns()).hasSameSizeAs(SUPPORTED_PRIMITIVES.fields()); + } + + @Test + public void testTwoLevelList() throws IOException { + Schema schema = + new Schema( + optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), + optional(2, "topbytes", Types.BinaryType.get())); + org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); + + File testFile = File.createTempFile("junit", null, temp.toFile()); + assertThat(testFile.delete()).isTrue(); + + ParquetWriter writer = + AvroParquetWriter.builder(new Path(testFile.toURI())) + .withDataModel(GenericData.get()) + .withSchema(avroSchema) + .config("parquet.avro.add-list-element-records", "true") + .config("parquet.avro.write-old-list-structure", "true") + .build(); + + GenericRecordBuilder recordBuilder = new GenericRecordBuilder(avroSchema); + List expectedByteList = Lists.newArrayList(); + byte[] expectedByte = {0x00, 0x01}; + ByteBuffer expectedBinary = ByteBuffer.wrap(expectedByte); + expectedByteList.add(expectedBinary); + recordBuilder.set("arraybytes", 
expectedByteList);
+    recordBuilder.set("topbytes", expectedBinary);
+    GenericData.Record expectedRecord = recordBuilder.build();
+
+    writer.write(expectedRecord);
+    writer.close();
+
+    try (CloseableIterable<RowData> reader =
+        Parquet.read(Files.localInput(testFile))
+            .project(schema)
+            .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type))
+            .build()) {
+      Iterator<RowData> rows = reader.iterator();
+      assertThat(rows).hasNext();
+      RowData rowData = rows.next();
+      assertThat(rowData.getArray(0).getBinary(0)).isEqualTo(expectedByte);
+      assertThat(rowData.getBinary(1)).isEqualTo(expectedByte);
+      assertThat(rows).isExhausted();
+    }
+  }
+
+  private void writeAndValidate(
+      Iterable<Record> iterable, Schema writeSchema, Schema expectedSchema) throws IOException {
+    File testFile = File.createTempFile("junit", null, temp.toFile());
+    assertThat(testFile.delete()).isTrue();
+
+    try (FileAppender<Record> writer =
+        Parquet.write(Files.localOutput(testFile))
+            .schema(writeSchema)
+            .createWriterFunc(GenericParquetWriter::create)
+            .build()) {
+      writer.addAll(iterable);
+    }
+
+    try (CloseableIterable<RowData> reader =
+        Parquet.read(Files.localInput(testFile))
+            .project(expectedSchema)
+            .createReaderFunc(type -> FlinkParquetReaders.buildReader(expectedSchema, type))
+            .build()) {
+      Iterator<Record> expected = iterable.iterator();
+      Iterator<RowData> rows = reader.iterator();
+      LogicalType rowType = FlinkSchemaUtil.convert(writeSchema);
+      for (int i = 0; i < NUM_RECORDS; i += 1) {
+        assertThat(rows).hasNext();
+        TestHelpers.assertRowData(writeSchema.asStruct(), rowType, expected.next(), rows.next());
+      }
+      assertThat(rows).isExhausted();
+    }
+  }
+
+  @Override
+  protected void writeAndValidate(Schema schema) throws IOException {
+    writeAndValidate(RandomGenericData.generate(schema, NUM_RECORDS, 19981), schema, schema);
+    writeAndValidate(
+        RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124),
+        schema,
+        schema);
+    writeAndValidate(
+        RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20),
+        schema,
+        schema);
+  }
+
+  @Override
+  protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throws IOException {
+    writeAndValidate(RandomGenericData.generate(writeSchema, 100, 0L), writeSchema, expectedSchema);
+  }
+
+  @Override
+  protected void writeAndValidate(Schema schema, List<Record> expectedData) throws IOException {
+    writeAndValidate(expectedData, schema, schema);
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java
new file mode 100644
index 000000000000..d181d3351410
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.data;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.binary.BinaryRowData;
+import org.apache.flink.table.runtime.typeutils.RowDataSerializer;
+import org.apache.flink.table.types.logical.LogicalType;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.data.DataTestBase;
+import org.apache.iceberg.data.RandomGenericData;
+import org.apache.iceberg.data.Record;
+import org.apache.iceberg.data.parquet.GenericParquetReaders;
+import org.apache.iceberg.flink.FlinkSchemaUtil;
+import org.apache.iceberg.flink.RowDataConverter;
+import org.apache.iceberg.flink.TestHelpers;
+import org.apache.iceberg.inmemory.InMemoryOutputFile;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.io.OutputFile;
+import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.junit.jupiter.api.io.TempDir;
+
+public class TestFlinkParquetWriter extends DataTestBase {
+  private static final int NUM_RECORDS = 100;
+
+  @TempDir private Path temp;
+
+  @Override
+  protected boolean supportsUnknown() {
+    return true;
+  }
+
+  @Override
+  protected boolean supportsTimestampNanos() {
+    return true;
+  }
+
+  private void writeAndValidate(Iterable<RowData> iterable, Schema schema) throws IOException {
+    OutputFile outputFile = new InMemoryOutputFile();
+
+    LogicalType logicalType = FlinkSchemaUtil.convert(schema);
+
+    try (FileAppender<RowData> writer =
+        Parquet.write(outputFile)
+            .schema(schema)
+            .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(logicalType, msgType))
+            .build()) {
+      writer.addAll(iterable);
+    }
+
+    try (CloseableIterable<Record> reader =
+        Parquet.read(outputFile.toInputFile())
+            .project(schema)
+            .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType))
+            .build()) {
+      Iterator<RowData> expected = iterable.iterator();
+      Iterator<Record> actual = reader.iterator();
+      LogicalType rowType = FlinkSchemaUtil.convert(schema);
+      for (int i = 0; i < NUM_RECORDS; i += 1) {
+        assertThat(actual).hasNext();
+        TestHelpers.assertRowData(schema.asStruct(), rowType, actual.next(), expected.next());
+      }
+      assertThat(actual).isExhausted();
+    }
+  }
+
+  @Override
+  protected void writeAndValidate(Schema schema) throws IOException {
+    writeAndValidate(RandomRowData.generate(schema, NUM_RECORDS, 19981), schema);
+
+    writeAndValidate(
+        RandomRowData.convert(
+            schema,
+            RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124)),
+        schema);
+
+    writeAndValidate(
+        RandomRowData.convert(
+            schema,
+            RandomGenericData.generateFallbackRecords(
+                schema, NUM_RECORDS, 21124, NUM_RECORDS / 20)),
+        schema);
+  }
+
+  @Override
+  protected void writeAndValidate(Schema schema, List<Record> expectedData) throws IOException {
+    RowDataSerializer rowDataSerializer = new RowDataSerializer(FlinkSchemaUtil.convert(schema));
+    List<RowData> binaryRowList = Lists.newArrayList();
+    for (Record record : expectedData) {
+      RowData rowData = RowDataConverter.convert(schema, record);
+      BinaryRowData binaryRow = rowDataSerializer.toBinaryRow(rowData);
+      binaryRowList.add(binaryRow);
+    }
+
+    writeAndValidate(binaryRowList, schema);
+  }
+}
diff --git
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java new file mode 100644 index 000000000000..4e5b38ffb026 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java @@ -0,0 +1,593 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.List; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.DataGenerator; +import org.apache.iceberg.flink.DataGenerators; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.StructProjection; +import org.junit.jupiter.api.Test; + +public class TestRowDataProjection { + @Test + public void testNullRootRowData() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowDataProjection projection = RowDataProjection.create(schema, schema.select("id")); + + assertThatThrownBy(() -> projection.wrap(null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid row data: null"); + } + + @Test + public void testFullProjection() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + generateAndValidate(schema, schema); + + GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); + testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); + } + + @Test + public void testReorderedFullProjection() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + Schema 
reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); + + generateAndValidate(schema, reordered); + + GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); + testEqualsAndHashCode(schema, reordered, rowData, copyRowData, otherRowData); + } + + @Test + public void testBasicProjection() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); + generateAndValidate(schema, idOnly); + generateAndValidate(schema, dataOnly); + + GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); + testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode(schema, dataOnly, rowData, copyRowData, otherRowData); + } + + @Test + public void testEmptyProjection() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + generateAndValidate(schema, schema.select()); + + GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); + testEqualsAndHashCode(schema, schema.select(), rowData, copyRowData, otherRowData, true); + } + + @Test + public void testRename() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + Schema renamed = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); + generateAndValidate(schema, renamed); + + GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); + testEqualsAndHashCode(schema, renamed, rowData, copyRowData, otherRowData); + } + + @Test + public void testNestedProjection() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); + + GenericRowData rowData = GenericRowData.of(1L, GenericRowData.of(1.0f, 1.0f)); + GenericRowData copyRowData = GenericRowData.of(1L, GenericRowData.of(1.0f, 1.0f)); + GenericRowData otherRowData = GenericRowData.of(2L, GenericRowData.of(2.0f, 2.0f)); + + GenericRowData rowDataNullStruct = GenericRowData.of(1L, null); + GenericRowData copyRowDataNullStruct = GenericRowData.of(1L, null); + GenericRowData otherRowDataNullStruct = GenericRowData.of(2L, null); + + // Project id 
only. + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + assertThat(idOnly.columns()).isNotEmpty(); + generateAndValidate(schema, idOnly); + testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode( + schema, idOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct); + + // Project lat only. + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); + assertThat(latOnly.columns()).isNotEmpty(); + generateAndValidate(schema, latOnly); + testEqualsAndHashCode(schema, latOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode( + schema, latOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct, true); + + // Project long only. + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); + assertThat(longOnly.columns()).isNotEmpty(); + generateAndValidate(schema, longOnly); + testEqualsAndHashCode(schema, longOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode( + schema, longOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct, true); + + // Project location. + Schema locationOnly = schema.select("location"); + assertThat(locationOnly.columns()).isNotEmpty(); + generateAndValidate(schema, locationOnly); + testEqualsAndHashCode(schema, locationOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode( + schema, + locationOnly, + rowDataNullStruct, + copyRowDataNullStruct, + otherRowDataNullStruct, + true); + } + + @Test + public void testPrimitivesFullProjection() { + DataGenerator dataGenerator = new DataGenerators.Primitives(); + Schema schema = dataGenerator.icebergSchema(); + generateAndValidate(schema, schema); + + GenericRowData rowData = dataGenerator.generateFlinkRowData(); + GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); + GenericRowData otherRowData = dataGenerator.generateFlinkRowData(); + // modify the string field value (position 6) + otherRowData.setField(6, StringData.fromString("foo_bar")); + testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); + + GenericRowData rowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); + setOptionalFieldsNullForPrimitives(rowDataNullOptionalFields); + GenericRowData copyRowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); + setOptionalFieldsNullForPrimitives(copyRowDataNullOptionalFields); + GenericRowData otherRowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); + // modify the string field value (position 6) + otherRowDataNullOptionalFields.setField(6, StringData.fromString("foo_bar")); + setOptionalFieldsNullForPrimitives(otherRowData); + testEqualsAndHashCode( + schema, + schema, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + } + + private void setOptionalFieldsNullForPrimitives(GenericRowData rowData) { + // fields from [1, 5] range are optional + for (int pos = 1; pos <= 5; ++pos) { + rowData.setField(pos, null); + } + } + + @Test + public void testMapOfPrimitivesProjection() { + DataGenerator dataGenerator = new DataGenerators.MapOfPrimitives(); + Schema schema = dataGenerator.icebergSchema(); + + // Project id only. 
+ Schema idOnly = schema.select("row_id"); + assertThat(idOnly.columns()).isNotEmpty(); + generateAndValidate(schema, idOnly); + + // Project map only. + Schema mapOnly = schema.select("map_of_primitives"); + assertThat(mapOnly.columns()).isNotEmpty(); + generateAndValidate(schema, mapOnly); + + // Project all. + generateAndValidate(schema, schema); + + GenericRowData rowData = dataGenerator.generateFlinkRowData(); + GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); + // modify the map field value + GenericRowData otherRowData = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2))); + testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData, true); + testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); + + GenericRowData rowDataNullOptionalFields = + GenericRowData.of(StringData.fromString("row_id_value"), null); + GenericRowData copyRowDataNullOptionalFields = + GenericRowData.of(StringData.fromString("row_id_value"), null); + // modify the map field value + GenericRowData otherRowDataNullOptionalFields = + GenericRowData.of(StringData.fromString("other_row_id_value"), null); + testEqualsAndHashCode( + schema, + idOnly, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + testEqualsAndHashCode( + schema, + mapOnly, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields, + true); + testEqualsAndHashCode( + schema, + schema, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + } + + @Test + public void testMapOfStructStructProjection() { + DataGenerator dataGenerator = new DataGenerators.MapOfStructStruct(); + Schema schema = dataGenerator.icebergSchema(); + + // Project id only. + Schema idOnly = schema.select("row_id"); + assertThat(idOnly.columns()).isNotEmpty(); + generateAndValidate(schema, idOnly); + + // Project map only. + Schema mapOnly = schema.select("map"); + assertThat(mapOnly.columns()).isNotEmpty(); + generateAndValidate(schema, mapOnly); + + // Project all. + generateAndValidate(schema, schema); + + // Project partial map key. + Schema partialMapKey = + new Schema( + Types.NestedField.optional( + 2, + "map", + Types.MapType.ofOptional( + 101, + 102, + Types.StructType.of( + Types.NestedField.required(201, "key", Types.LongType.get())), + Types.StructType.of( + Types.NestedField.required(203, "value", Types.LongType.get()), + Types.NestedField.required(204, "valueData", Types.StringType.get()))))); + assertThatThrownBy(() -> generateAndValidate(schema, partialMapKey)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot project a partial map key or value struct."); + + // Project partial map key. 
+ Schema partialMapValue = + new Schema( + Types.NestedField.optional( + 2, + "map", + Types.MapType.ofOptional( + 101, + 102, + Types.StructType.of( + Types.NestedField.required(201, "key", Types.LongType.get()), + Types.NestedField.required(202, "keyData", Types.StringType.get())), + Types.StructType.of( + Types.NestedField.required(203, "value", Types.LongType.get()))))); + assertThatThrownBy(() -> generateAndValidate(schema, partialMapValue)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot project a partial map key or value struct."); + + GenericRowData rowData = dataGenerator.generateFlinkRowData(); + GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); + // modify the map field value + GenericRowData otherRowData = + GenericRowData.of( + StringData.fromString("other_row_id_value"), + new GenericMapData( + ImmutableMap.of( + GenericRowData.of(1L, StringData.fromString("other_key_data")), + GenericRowData.of(1L, StringData.fromString("other_value_data"))))); + testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); + + GenericRowData rowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of(GenericRowData.of(1L, null), GenericRowData.of(1L, null)))); + GenericRowData copyRowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of(GenericRowData.of(1L, null), GenericRowData.of(1L, null)))); + // modify the map field value + GenericRowData otherRowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("other_row_id_value"), + new GenericMapData( + ImmutableMap.of(GenericRowData.of(2L, null), GenericRowData.of(2L, null)))); + testEqualsAndHashCode( + schema, + idOnly, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + testEqualsAndHashCode( + schema, + mapOnly, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + testEqualsAndHashCode( + schema, + schema, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + } + + @Test + public void testArrayOfPrimitiveProjection() { + DataGenerator dataGenerator = new DataGenerators.ArrayOfPrimitive(); + Schema schema = dataGenerator.icebergSchema(); + + // Project id only. + Schema idOnly = schema.select("row_id"); + assertThat(idOnly.columns()).isNotEmpty(); + generateAndValidate(schema, idOnly); + + // Project list only. + Schema arrayOnly = schema.select("array_of_int"); + assertThat(arrayOnly.columns()).isNotEmpty(); + generateAndValidate(schema, arrayOnly); + + // Project all. 
+ generateAndValidate(schema, schema); + + GenericRowData rowData = dataGenerator.generateFlinkRowData(); + GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); + // modify the map field value + GenericRowData otherRowData = + GenericRowData.of( + StringData.fromString("other_row_id_value"), + new GenericArrayData(new Integer[] {4, 5, 6})); + testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode(schema, arrayOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); + + GenericRowData rowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(new Integer[] {1, null, 3})); + GenericRowData copyRowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(new Integer[] {1, null, 3})); + // modify the map field value + GenericRowData otherRowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("other_row_id_value"), + new GenericArrayData(new Integer[] {4, null, 6})); + testEqualsAndHashCode( + schema, + idOnly, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + testEqualsAndHashCode( + schema, + arrayOnly, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + testEqualsAndHashCode( + schema, + schema, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + } + + @Test + public void testArrayOfStructProjection() { + DataGenerator dataGenerator = new DataGenerators.ArrayOfStruct(); + Schema schema = dataGenerator.icebergSchema(); + + // Project id only. + Schema idOnly = schema.select("row_id"); + assertThat(idOnly.columns()).isNotEmpty(); + generateAndValidate(schema, idOnly); + + // Project list only. + Schema arrayOnly = schema.select("array_of_struct"); + assertThat(arrayOnly.columns()).isNotEmpty(); + generateAndValidate(schema, arrayOnly); + + // Project all. + generateAndValidate(schema, schema); + + // Project partial list value. 
+    Schema partialList =
+        new Schema(
+            Types.NestedField.optional(
+                2,
+                "array_of_struct",
+                Types.ListType.ofOptional(
+                    101,
+                    Types.StructType.of(
+                        Types.NestedField.required(202, "name", Types.StringType.get())))));
+
+    assertThatThrownBy(() -> generateAndValidate(schema, partialList))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("Cannot project a partial list element struct.");
+
+    GenericRowData rowData = dataGenerator.generateFlinkRowData();
+    GenericRowData copyRowData = dataGenerator.generateFlinkRowData();
+    // modify the map field value
+    GenericRowData otherRowData =
+        GenericRowData.of(
+            StringData.fromString("row_id_value"), new GenericArrayData(new Integer[] {4, 5, 6}));
+    testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData);
+
+    GenericRowData rowDataNullOptionalFields =
+        GenericRowData.of(
+            StringData.fromString("row_id_value"),
+            new GenericArrayData(new Integer[] {1, null, 3}));
+    GenericRowData copyRowDataNullOptionalFields =
+        GenericRowData.of(
+            StringData.fromString("row_id_value"),
+            new GenericArrayData(new Integer[] {1, null, 3}));
+    // modify the map field value
+    GenericRowData otherRowDataNullOptionalFields =
+        GenericRowData.of(
+            StringData.fromString("row_id_value"),
+            new GenericArrayData(new Integer[] {4, null, 6}));
+    testEqualsAndHashCode(
+        schema,
+        schema,
+        rowDataNullOptionalFields,
+        copyRowDataNullOptionalFields,
+        otherRowDataNullOptionalFields);
+  }
+
+  private void generateAndValidate(Schema schema, Schema projectSchema) {
+    int numRecords = 100;
+    List<Record> recordList = RandomGenericData.generate(schema, numRecords, 102L);
+    List<RowData> rowDataList =
+        Lists.newArrayList(RandomRowData.generate(schema, numRecords, 102L).iterator());
+    assertThat(rowDataList).hasSize(recordList.size());
+
+    StructProjection structProjection = StructProjection.create(schema, projectSchema);
+    RowDataProjection rowDataProjection = RowDataProjection.create(schema, projectSchema);
+
+    for (int i = 0; i < numRecords; i++) {
+      StructLike expected = structProjection.wrap(recordList.get(i));
+      RowData projected = rowDataProjection.wrap(rowDataList.get(i));
+      TestHelpers.assertRowData(projectSchema, expected, projected);
+
+      assertThat(projected).isEqualTo(projected);
+      assertThat(projected).hasSameHashCodeAs(projected);
+      // make sure toString doesn't throw NPE for null values
+      assertThatNoException().isThrownBy(projected::toString);
+    }
+  }
+
+  private void testEqualsAndHashCode(
+      Schema schema,
+      Schema projectionSchema,
+      RowData rowData,
+      RowData copyRowData,
+      RowData otherRowData) {
+    testEqualsAndHashCode(schema, projectionSchema, rowData, copyRowData, otherRowData, false);
+  }
+
+  /**
+   * @param isOtherRowDataSameAsRowData sometimes projection on otherRowData can result in the same
+   *     RowData, e.g.
due to empty projection or null struct + */ + private void testEqualsAndHashCode( + Schema schema, + Schema projectionSchema, + RowData rowData, + RowData copyRowData, + RowData otherRowData, + boolean isOtherRowDataSameAsRowData) { + RowDataProjection projection = RowDataProjection.create(schema, projectionSchema); + RowDataProjection copyProjection = RowDataProjection.create(schema, projectionSchema); + RowDataProjection otherProjection = RowDataProjection.create(schema, projectionSchema); + + assertThat(projection.wrap(rowData)).isEqualTo(copyProjection.wrap(copyRowData)); + assertThat(projection.wrap(rowData)).hasSameHashCodeAs(copyProjection.wrap(copyRowData)); + + if (isOtherRowDataSameAsRowData) { + assertThat(projection.wrap(rowData)).isEqualTo(otherProjection.wrap(otherRowData)); + assertThat(projection.wrap(rowData)).hasSameHashCodeAs(otherProjection.wrap(otherRowData)); + } else { + assertThat(projection.wrap(rowData)).isNotEqualTo(otherProjection.wrap(otherRowData)); + assertThat(projection.wrap(rowData)) + .doesNotHaveSameHashCodeAs(otherProjection.wrap(otherRowData)); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java new file mode 100644 index 000000000000..3b98939ea167 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java @@ -0,0 +1,582 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.flink.data;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.withPrecision;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Path;
+import org.apache.flink.table.data.ArrayData;
+import org.apache.flink.table.data.GenericArrayData;
+import org.apache.flink.table.data.GenericMapData;
+import org.apache.flink.table.data.GenericRowData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.StringData;
+import org.apache.iceberg.Files;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.avro.Avro;
+import org.apache.iceberg.flink.FlinkSchemaUtil;
+import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
+import org.apache.iceberg.types.Comparators;
+import org.apache.iceberg.types.Types;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+public class TestRowProjection {
+
+  @TempDir private Path temp;
+
+  private RowData writeAndRead(String desc, Schema writeSchema, Schema readSchema, RowData row)
+      throws IOException {
+    File file = File.createTempFile("junit", desc + ".avro", temp.toFile());
+    assertThat(file.delete()).isTrue();
+
+    try (FileAppender<RowData> appender =
+        Avro.write(Files.localOutput(file))
+            .schema(writeSchema)
+            .createWriterFunc(ignore -> new FlinkAvroWriter(FlinkSchemaUtil.convert(writeSchema)))
+            .build()) {
+      appender.add(row);
+    }
+
+    Avro.ReadBuilder builder =
+        Avro.read(Files.localInput(file))
+            .project(readSchema)
+            .createResolvingReader(FlinkPlannedAvroReader::create);
+
+    Iterable<RowData> records = builder.build();
+
+    return Iterables.getOnlyElement(records);
+  }
+
+  @Test
+  public void testFullProjection() throws Exception {
+    Schema schema =
+        new Schema(
+            Types.NestedField.required(0, "id", Types.LongType.get()),
+            Types.NestedField.optional(1, "data", Types.StringType.get()));
+
+    RowData row = GenericRowData.of(34L, StringData.fromString("test"));
+
+    RowData projected = writeAndRead("full_projection", schema, schema, row);
+
+    assertThat(projected.getLong(0)).isEqualTo(34);
+    assertThat(projected.getString(1)).asString().isEqualTo("test");
+  }
+
+  @Test
+  public void testSpecialCharacterProjection() throws Exception {
+    Schema schema =
+        new Schema(
+            Types.NestedField.required(0, "user id", Types.LongType.get()),
+            Types.NestedField.optional(1, "data%0", Types.StringType.get()));
+
+    RowData row = GenericRowData.of(34L, StringData.fromString("test"));
+
+    RowData full = writeAndRead("special_chars", schema, schema, row);
+
+    assertThat(full.getLong(0)).isEqualTo(34L);
+    assertThat(full.getString(1)).asString().isEqualTo("test");
+
+    RowData projected = writeAndRead("special_characters", schema, schema.select("data%0"), full);
+
+    assertThat(projected.getArity()).isEqualTo(1);
+    assertThat(projected.getString(0)).asString().isEqualTo("test");
+  }
+
+  @Test
+  public void testReorderedFullProjection() throws Exception {
+    Schema schema =
+        new Schema(
+            Types.NestedField.required(0, "id", Types.LongType.get()),
+            Types.NestedField.optional(1, "data", Types.StringType.get()));
+
+    RowData row = GenericRowData.of(34L, StringData.fromString("test"));
+
+    Schema reordered =
+        new Schema(
+            Types.NestedField.optional(1, "data", Types.StringType.get()),
+            Types.NestedField.required(0, "id", Types.LongType.get()));
+
+    RowData projected =
writeAndRead("full_projection", schema, reordered, row); + + assertThat(projected.getString(0)).asString().isEqualTo("test"); + assertThat(projected.getLong(1)).isEqualTo(34); + } + + @Test + public void testReorderedProjection() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowData row = GenericRowData.of(34L, StringData.fromString("test")); + + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); + + RowData projected = writeAndRead("full_projection", schema, reordered, row); + + assertThat(projected.isNullAt(0)).isTrue(); + assertThat(projected.getString(1)).asString().isEqualTo("test"); + assertThat(projected.isNullAt(2)).isTrue(); + } + + @Test + public void testRenamedAddedField() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required(1, "a", Types.LongType.get()), + Types.NestedField.required(2, "b", Types.LongType.get()), + Types.NestedField.required(3, "d", Types.LongType.get())); + + RowData row = GenericRowData.of(100L, 200L, 300L); + + Schema renamedAdded = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional(2, "b", Types.LongType.get()), + Types.NestedField.optional(3, "c", Types.LongType.get()), + Types.NestedField.optional(4, "d", Types.LongType.get())); + + RowData projected = writeAndRead("rename_and_add_column_projection", schema, renamedAdded, row); + assertThat(projected.getLong(0)) + .as("Should contain the correct value in column 1") + .isEqualTo(100L); + assertThat(projected.getLong(1)) + .as("Should contain the correct value in column 2") + .isEqualTo(200L); + assertThat(projected.getLong(2)) + .as("Should contain the correct value in column 1") + .isEqualTo(300L); + assertThat(projected.isNullAt(3)).as("Should contain empty value on new column 4").isTrue(); + } + + @Test + public void testEmptyProjection() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowData row = GenericRowData.of(34L, StringData.fromString("test")); + + RowData projected = writeAndRead("empty_projection", schema, schema.select(), row); + + assertThat(projected).isNotNull(); + assertThat(projected.getArity()).isEqualTo(0); + } + + @Test + public void testBasicProjection() throws Exception { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowData row = GenericRowData.of(34L, StringData.fromString("test")); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("basic_projection_id", writeSchema, idOnly, row); + assertThat(projected.getArity()).as("Should not project data").isEqualTo(1); + assertThat(projected.getLong(0)).isEqualTo(34L); + + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); + + projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, row); + + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + int cmp = Comparators.charSequences().compare("test", projected.getString(0).toString()); + 
assertThat(projected.getString(0)).asString().isEqualTo("test"); + } + + @Test + public void testRename() throws Exception { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowData row = GenericRowData.of(34L, StringData.fromString("test")); + + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); + + RowData projected = writeAndRead("project_and_rename", writeSchema, readSchema, row); + + assertThat(projected.getLong(0)).isEqualTo(34L); + assertThat(projected.getString(1)) + .as("Should contain the correct data/renamed value") + .asString() + .isEqualTo("test"); + } + + @Test + public void testNestedStructProjection() throws Exception { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); + + RowData location = GenericRowData.of(52.995143f, -1.539054f); + RowData record = GenericRowData.of(34L, location); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("id_only", writeSchema, idOnly, record); + assertThat(projected.getArity()).isEqualTo(1); + assertThat(projected.getLong(0)).as("Should contain the correct id value").isEqualTo(34L); + + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); + + projected = writeAndRead("latitude_only", writeSchema, latOnly, record); + RowData projectedLocation = projected.getRow(0, 1); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); + assertThat(projectedLocation.getArity()).as("Should not project longitude").isEqualTo(1); + assertThat(projectedLocation.getFloat(0)) + .as("Should project latitude") + .isEqualTo(52.995143f, withPrecision(0.000001f)); + + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); + + projected = writeAndRead("longitude_only", writeSchema, longOnly, record); + projectedLocation = projected.getRow(0, 1); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); + assertThat(projectedLocation.getArity()).as("Should not project latitutde").isEqualTo(1); + assertThat(projectedLocation.getFloat(0)) + .as("Should project longitude") + .isEqualTo(-1.539054f, withPrecision(0.000001f)); + + Schema locationOnly = writeSchema.select("location"); + projected = writeAndRead("location_only", writeSchema, locationOnly, record); + projectedLocation = projected.getRow(0, 1); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); + assertThat(projectedLocation.getFloat(0)) + .as("Should project latitude") + .isEqualTo(52.995143f, withPrecision(0.000001f)); + assertThat(projectedLocation.getFloat(1)) + .as("Should project longitude") + .isEqualTo(-1.539054f, withPrecision(0.000001f)); + 
} + + @Test + public void testMapProjection() throws IOException { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); + + GenericMapData properties = + new GenericMapData( + ImmutableMap.of( + StringData.fromString("a"), + StringData.fromString("A"), + StringData.fromString("b"), + StringData.fromString("B"))); + + RowData row = GenericRowData.of(34L, properties); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); + assertThat(projected.getLong(0)).isEqualTo(34L); + assertThat(projected.getArity()).as("Should not project properties map").isEqualTo(1); + + Schema keyOnly = writeSchema.select("properties.key"); + projected = writeAndRead("key_only", writeSchema, keyOnly, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getMap(0)).isEqualTo(properties); + + Schema valueOnly = writeSchema.select("properties.value"); + projected = writeAndRead("value_only", writeSchema, valueOnly, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getMap(0)).isEqualTo(properties); + + Schema mapOnly = writeSchema.select("properties"); + projected = writeAndRead("map_only", writeSchema, mapOnly, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getMap(0)).isEqualTo(properties); + } + + @Test + public void testMapOfStructsProjection() throws IOException { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); + + RowData l1 = GenericRowData.of(53.992811f, -1.542616f); + RowData l2 = GenericRowData.of(52.995143f, -1.539054f); + GenericMapData map = + new GenericMapData( + ImmutableMap.of(StringData.fromString("L1"), l1, StringData.fromString("L2"), l2)); + RowData row = GenericRowData.of(34L, map); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); + assertThat(projected.getLong(0)).isEqualTo(34L); + assertThat(projected.getArity()).as("Should not project locations map").isEqualTo(1); + + projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getMap(0)).isEqualTo(row.getMap(1)); + + projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), row); + GenericMapData locations = (GenericMapData) projected.getMap(0); + assertThat(locations).isNotNull(); + GenericArrayData l1l2Array = + new GenericArrayData( + new Object[] {StringData.fromString("L2"), StringData.fromString("L1")}); + assertThat(locations.keyArray()).isEqualTo(l1l2Array); + RowData projectedL1 = (RowData) locations.get(StringData.fromString("L1")); + assertThat(projectedL1).isNotNull(); + assertThat(projectedL1.getFloat(0)) + .as("L1 should contain lat") + .isEqualTo(53.992811f, withPrecision(0.000001f)); + 
assertThat(projectedL1.getArity()).as("L1 should not contain long").isEqualTo(1); + RowData projectedL2 = (RowData) locations.get(StringData.fromString("L2")); + assertThat(projectedL2).isNotNull(); + assertThat(projectedL2.getFloat(0)) + .as("L2 should contain lat") + .isEqualTo(52.995143f, withPrecision(0.000001f)); + assertThat(projectedL2.getArity()).as("L2 should not contain long").isEqualTo(1); + + projected = writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + locations = (GenericMapData) projected.getMap(0); + assertThat(locations).isNotNull(); + assertThat(locations.keyArray()).isEqualTo(l1l2Array); + projectedL1 = (RowData) locations.get(StringData.fromString("L1")); + assertThat(projectedL1).isNotNull(); + assertThat(projectedL1.getArity()).as("L1 should not contain lat").isEqualTo(1); + assertThat(projectedL1.getFloat(0)) + .as("L1 should contain long") + .isEqualTo(-1.542616f, withPrecision(0.000001f)); + projectedL2 = (RowData) locations.get(StringData.fromString("L2")); + assertThat(projectedL2).isNotNull(); + assertThat(projectedL2.getArity()).as("L2 should not contain lat").isEqualTo(1); + assertThat(projectedL2.getFloat(0)) + .as("L2 should contain long") + .isEqualTo(-1.539054f, withPrecision(0.000001f)); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); + + projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + locations = (GenericMapData) projected.getMap(0); + assertThat(locations).isNotNull(); + assertThat(locations.keyArray()).isEqualTo(l1l2Array); + projectedL1 = (RowData) locations.get(StringData.fromString("L1")); + assertThat(projectedL1).isNotNull(); + assertThat(projectedL1.getFloat(0)) + .as("L1 should contain latitude") + .isEqualTo(53.992811f, withPrecision(0.000001f)); + projectedL2 = (RowData) locations.get(StringData.fromString("L2")); + assertThat(projectedL2).isNotNull(); + assertThat(projectedL2.getFloat(0)) + .as("L2 should contain latitude") + .isEqualTo(52.995143f, withPrecision(0.000001f)); + } + + @Test + public void testListProjection() throws IOException { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); + + GenericArrayData values = new GenericArrayData(new Long[] {56L, 57L, 58L}); + + RowData row = GenericRowData.of(34L, values); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); + assertThat(projected.getLong(0)).isEqualTo(34L); + assertThat(projected.getArity()).as("Should not project values list").isEqualTo(1); + + Schema elementOnly = writeSchema.select("values.element"); + projected = writeAndRead("element_only", writeSchema, elementOnly, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getArray(0)).isEqualTo(values); + + Schema listOnly = writeSchema.select("values"); + projected = writeAndRead("list_only", writeSchema, listOnly, row); + assertThat(projected.getArity()).as("Should not project 
id").isEqualTo(1); + assertThat(projected.getArray(0)).isEqualTo(values); + } + + @Test + public void testListOfStructsProjection() throws IOException { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); + + RowData p1 = GenericRowData.of(1, 2); + RowData p2 = GenericRowData.of(3, null); + GenericArrayData arrayData = new GenericArrayData(new RowData[] {p1, p2}); + RowData row = GenericRowData.of(34L, arrayData); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); + assertThat(projected.getLong(0)).isEqualTo(34L); + assertThat(projected.getArity()).isEqualTo(1); + + projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getArray(0)).isEqualTo(row.getArray(1)); + + projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).isFalse(); + ArrayData points = projected.getArray(0); + assertThat(points.size()).isEqualTo(2); + RowData projectedP1 = points.getRow(0, 2); + assertThat(projectedP1.getInt(0)).as("Should project x").isEqualTo(1); + assertThat(projectedP1.getArity()).as("Should not project y").isEqualTo(1); + RowData projectedP2 = points.getRow(1, 2); + assertThat(projectedP2.getArity()).as("Should not project y").isEqualTo(1); + assertThat(projectedP2.getInt(0)).as("Should project x").isEqualTo(3); + + projected = writeAndRead("y_only", writeSchema, writeSchema.select("points.y"), row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).isFalse(); + points = projected.getArray(0); + assertThat(points.size()).isEqualTo(2); + projectedP1 = points.getRow(0, 2); + assertThat(projectedP1.getArity()).as("Should not project x").isEqualTo(1); + assertThat(projectedP1.getInt(0)).as("Should project y").isEqualTo(2); + projectedP2 = points.getRow(1, 2); + assertThat(projectedP2.getArity()).as("Should not project x").isEqualTo(1); + assertThat(projectedP2.isNullAt(0)).as("Should project null y").isTrue(); + + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); + + projected = writeAndRead("y_renamed", writeSchema, yRenamed, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).isFalse(); + points = projected.getArray(0); + assertThat(points.size()).isEqualTo(2); + projectedP1 = points.getRow(0, 2); + assertThat(projectedP1.getArity()).as("Should not project x and y").isEqualTo(1); + assertThat(projectedP1.getInt(0)).as("Should project z").isEqualTo(2); + projectedP2 = points.getRow(1, 2); + assertThat(projectedP2.getArity()).as("Should not project x and y").isEqualTo(1); + assertThat(projectedP2.isNullAt(0)).as("Should project null z").isTrue(); + } + + @Test + public void testAddedFieldsWithRequiredChildren() throws Exception { + Schema schema = new 
Schema(Types.NestedField.required(1, "a", Types.LongType.get())); + + RowData row = GenericRowData.of(100L); + + Schema addedFields = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional( + 2, + "b", + Types.StructType.of(Types.NestedField.required(3, "c", Types.LongType.get()))), + Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), + Types.NestedField.optional( + 6, + "e", + Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get()))); + + RowData projected = + writeAndRead("add_fields_with_required_children_projection", schema, addedFields, row); + assertThat(projected.getLong(0)) + .as("Should contain the correct value in column 1") + .isEqualTo(100L); + assertThat(projected.isNullAt(1)).as("Should contain empty value in new column 2").isTrue(); + assertThat(projected.isNullAt(2)).as("Should contain empty value in new column 4").isTrue(); + assertThat(projected.isNullAt(3)).as("Should contain empty value in new column 6").isTrue(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java new file mode 100644 index 000000000000..eccab20e04fc --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.flink.DataGenerator; +import org.apache.iceberg.flink.DataGenerators; +import org.apache.iceberg.flink.TestHelpers; +import org.junit.jupiter.api.Test; + +public class TestStructRowData { + + protected void testConverter(DataGenerator dataGenerator) { + StructRowData converter = new StructRowData(dataGenerator.icebergSchema().asStruct()); + GenericRecord expected = dataGenerator.generateIcebergGenericRecord(); + StructRowData actual = converter.setStruct(expected); + TestHelpers.assertRowData(dataGenerator.icebergSchema(), expected, actual); + } + + @Test + public void testPrimitiveTypes() { + testConverter(new DataGenerators.Primitives()); + } + + @Test + public void testStructOfPrimitive() { + testConverter(new DataGenerators.StructOfPrimitive()); + } + + @Test + public void testStructOfArray() { + testConverter(new DataGenerators.StructOfArray()); + } + + @Test + public void testStructOfMap() { + testConverter(new DataGenerators.StructOfMap()); + } + + @Test + public void testStructOfStruct() { + testConverter(new DataGenerators.StructOfStruct()); + } + + @Test + public void testArrayOfPrimitive() { + testConverter(new DataGenerators.ArrayOfPrimitive()); + } + + @Test + public void testArrayOfArray() { + testConverter(new DataGenerators.ArrayOfArray()); + } + + @Test + public void testArrayOfMap() { + testConverter(new DataGenerators.ArrayOfMap()); + } + + @Test + public void testArrayOfStruct() { + testConverter(new DataGenerators.ArrayOfStruct()); + } + + @Test + public void testMapOfPrimitives() { + testConverter(new DataGenerators.MapOfPrimitives()); + } + + @Test + public void testMapOfArray() { + testConverter(new DataGenerators.MapOfArray()); + } + + @Test + public void testMapOfMap() { + testConverter(new DataGenerators.MapOfMap()); + } + + @Test + public void testMapOfStruct() { + testConverter(new DataGenerators.MapOfStruct()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java new file mode 100644 index 000000000000..10efb9120c6e --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import static org.apache.iceberg.flink.maintenance.operator.OperatorTestBase.IGNORED_OPERATOR_NAME; + +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.flink.maintenance.operator.CollectingSink; +import org.apache.iceberg.flink.maintenance.operator.ManualSource; +import org.junit.jupiter.api.extension.BeforeEachCallback; +import org.junit.jupiter.api.extension.ExtensionContext; + +/** + * {@link org.junit.jupiter.api.extension.Extension} used to generate the common elements for the + * {@link MaintenanceTaskBuilder} implementations. These are the following: + * + *

<ul>
+ *   <li>{@link StreamExecutionEnvironment} - environment for testing
+ *   <li>{@link ManualSource} - source for manually emitting {@link Trigger}s
+ *   <li>{@link DataStream} - which is generated from the {@link ManualSource}
+ *   <li>{@link CollectingSink} - which can be used to poll for the records emitted by the
+ *       maintenance tasks
+ * </ul>
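+ *
+ * <p>For illustration only, a minimal sketch of how the tests in this patch wire these pieces
+ * together (it mirrors the {@code TestExpireSnapshots} usage later in this change; constants such
+ * as {@code DUMMY_TABLE_NAME}, {@code DUMMY_TASK_NAME} and {@code UID_SUFFIX} come from the test
+ * base classes added here):
+ *
+ * <pre>{@code
+ * @RegisterExtension
+ * MaintenanceTaskInfraExtension infra = new MaintenanceTaskInfraExtension();
+ *
+ * // inside a test method: schedule the task on the trigger stream and collect its results
+ * ExpireSnapshots.builder()
+ *     .maxSnapshotAge(Duration.ZERO)
+ *     .retainLast(1)
+ *     .append(
+ *         infra.triggerStream(),
+ *         DUMMY_TABLE_NAME,
+ *         DUMMY_TASK_NAME,
+ *         0,
+ *         tableLoader(),
+ *         UID_SUFFIX,
+ *         StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP,
+ *         1)
+ *     .sinkTo(infra.sink());
+ * }</pre>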
    + */ +class MaintenanceTaskInfraExtension implements BeforeEachCallback { + private StreamExecutionEnvironment env; + private ManualSource source; + private DataStream triggerStream; + private CollectingSink sink; + + @Override + public void beforeEach(ExtensionContext context) { + this.env = StreamExecutionEnvironment.getExecutionEnvironment(); + this.source = new ManualSource<>(env, TypeInformation.of(Trigger.class)); + // Adds the watermark to mimic the behaviour expected for the input of the maintenance tasks + this.triggerStream = + source + .dataStream() + .assignTimestampsAndWatermarks(new TableMaintenance.PunctuatedWatermarkStrategy()) + .name(IGNORED_OPERATOR_NAME) + .forceNonParallel(); + this.sink = new CollectingSink<>(); + } + + StreamExecutionEnvironment env() { + return env; + } + + ManualSource source() { + return source; + } + + DataStream triggerStream() { + return triggerStream; + } + + CollectingSink sink() { + return sink; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java new file mode 100644 index 000000000000..fc8f7ad5124e --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.time.Duration; +import java.util.function.Supplier; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.flink.maintenance.operator.CollectingSink; +import org.apache.iceberg.flink.maintenance.operator.ManualSource; +import org.apache.iceberg.flink.maintenance.operator.OperatorTestBase; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.extension.RegisterExtension; + +class MaintenanceTaskTestBase extends OperatorTestBase { + private static final int TESTING_TASK_ID = 0; + private static final Duration POLL_DURATION = Duration.ofSeconds(5); + + @RegisterExtension MaintenanceTaskInfraExtension infra = new MaintenanceTaskInfraExtension(); + + void runAndWaitForSuccess( + StreamExecutionEnvironment env, + ManualSource triggerSource, + CollectingSink collectingSink) + throws Exception { + runAndWaitForResult( + env, + triggerSource, + collectingSink, + false /* generateFailure */, + () -> true /* waitForCondition */, + true /* resultSuccess */); + } + + void runAndWaitForSuccess( + StreamExecutionEnvironment env, + ManualSource triggerSource, + CollectingSink collectingSink, + Supplier waitForCondition) + throws Exception { + runAndWaitForResult( + env, + triggerSource, + collectingSink, + false /* generateFailure */, + waitForCondition, + true /* resultSuccess */); + } + + void runAndWaitForFailure( + StreamExecutionEnvironment env, + ManualSource triggerSource, + CollectingSink collectingSink) + throws Exception { + runAndWaitForResult( + env, + triggerSource, + collectingSink, + true /* generateFailure */, + () -> true /* waitForCondition */, + true /* resultSuccess */); + } + + void runAndWaitForResult( + StreamExecutionEnvironment env, + ManualSource triggerSource, + CollectingSink collectingSink, + boolean generateFailure, + Supplier waitForCondition, + boolean resultSuccess) + throws Exception { + JobClient jobClient = null; + try { + jobClient = env.executeAsync(); + + // Do a single successful task run + long time = System.currentTimeMillis(); + triggerSource.sendRecord(Trigger.create(time, TESTING_TASK_ID), time); + + TaskResult result = collectingSink.poll(POLL_DURATION); + + assertThat(result.startEpoch()).isEqualTo(time); + assertThat(result.success()).isEqualTo(resultSuccess); + assertThat(result.taskIndex()).isEqualTo(TESTING_TASK_ID); + + if (generateFailure) { + dropTable(); + time = System.currentTimeMillis(); + triggerSource.sendRecord(Trigger.create(time, TESTING_TASK_ID), time); + result = collectingSink.poll(POLL_DURATION); + + assertThat(result.startEpoch()).isEqualTo(time); + assertThat(result.success()).isFalse(); + assertThat(result.taskIndex()).isEqualTo(TESTING_TASK_ID); + } + + Awaitility.await().until(waitForCondition::get); + } finally { + closeJobClient(jobClient); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java new file mode 100644 index 000000000000..12f5269773d1 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java @@ -0,0 +1,340 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.api; + +import static org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.DELETE_FILES_TASK_NAME; +import static org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.FILESYSTEM_FILES_TASK_NAME; +import static org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.METADATA_FILES_TASK_NAME; +import static org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.PLANNER_TASK_NAME; +import static org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.READER_TASK_NAME; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.DELETE_FILE_FAILED_COUNTER; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.DELETE_FILE_SUCCEEDED_COUNTER; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.ERROR_COUNTER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.FileSystems; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Duration; +import java.util.List; +import java.util.stream.StreamSupport; +import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.maintenance.operator.MetricsReporterFactoryForTests; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.Test; + +class TestDeleteOrphanFiles extends MaintenanceTaskTestBase { + + private Path relative(Table table, String relativePath) { + return FileSystems.getDefault().getPath(table.location().substring(5), relativePath); + } + + private void createFiles(Path... 
paths) throws IOException { + for (Path path : paths) { + Files.write(path, "DUMMY".getBytes(StandardCharsets.UTF_8)); + } + } + + @Test + void testDeleteOrphanFilesUnPartitioned() throws Exception { + Table table = createTable(); + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + insert(table, 4, "d"); + + assertFileNum(table, 4, 0); + + Path inData = relative(table, "metadata/in_data"); + Path inMetadata = relative(table, "metadata/in_metadata"); + + createFiles(inData); + createFiles(inMetadata); + assertThat(inMetadata).exists(); + assertThat(inData).exists(); + + appendDeleteOrphanFiles(); + + runAndWaitForSuccess( + infra.env(), infra.source(), infra.sink(), () -> checkDeleteFinished(table.name(), 2L)); + assertThat(inMetadata).doesNotExist(); + assertThat(inData).doesNotExist(); + assertFileNum(table, 4, 0); + + // Check the metrics + MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put( + ImmutableList.of( + PLANNER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + READER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + FILESYSTEM_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + METADATA_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + DELETE_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + DELETE_FILE_FAILED_COUNTER), + 0L) + .put( + ImmutableList.of( + DELETE_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + DELETE_FILE_SUCCEEDED_COUNTER), + 2L) + .build()); + } + + @Test + void testDeleteOrphanFilesPartitioned() throws Exception { + Table table = createPartitionedTable(); + insertPartitioned(table, 1, "p1"); + insertPartitioned(table, 2, "p1"); + insertPartitioned(table, 3, "p2"); + insertPartitioned(table, 4, "p2"); + + assertFileNum(table, 4, 0); + + Path inMetadata = relative(table, "metadata/in_metadata"); + Path inData = relative(table, "metadata/in_data"); + + createFiles(inMetadata); + createFiles(inData); + assertThat(inMetadata).exists(); + assertThat(inData).exists(); + + appendDeleteOrphanFiles(); + + runAndWaitForSuccess( + infra.env(), infra.source(), infra.sink(), () -> checkDeleteFinished(table.name(), 2L)); + assertThat(inMetadata).doesNotExist(); + assertThat(inData).doesNotExist(); + + assertFileNum(table, 4, 0); + + // Check the metrics + MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put( + ImmutableList.of( + PLANNER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + READER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + FILESYSTEM_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + METADATA_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + DELETE_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + DELETE_FILE_FAILED_COUNTER), + 0L) + .put( + ImmutableList.of( + DELETE_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + DELETE_FILE_SUCCEEDED_COUNTER), + 2L) + .build()); + } + + @Test + void testDeleteOrphanFilesFailure() throws Exception { + Table table = createTable(); + 
insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + insert(table, 4, "d"); + + assertFileNum(table, 4, 0); + + Path inData = relative(table, "metadata/in_data"); + Path inMetadata = relative(table, "metadata/in_metadata"); + + createFiles(inData); + createFiles(inMetadata); + assertThat(inMetadata).exists(); + assertThat(inData).exists(); + + appendDeleteOrphanFiles(); + + // Mock error in the delete files operator + Long parentId = table.currentSnapshot().parentId(); + for (ManifestFile manifestFile : table.snapshot(parentId).allManifests(table.io())) { + table.io().deleteFile(manifestFile.path()); + } + + runAndWaitForResult( + infra.env(), + infra.source(), + infra.sink(), + false /* generateFailure */, + () -> checkDeleteFinished(table.name(), 0L), + false /* resultSuccess*/); + + // An error occurred; the file should not be deleted. And the job should not be failed. + assertThat(inMetadata).exists(); + assertThat(inData).exists(); + + // Check the metrics + MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put( + ImmutableList.of( + PLANNER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + READER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), + 1L) + .put( + ImmutableList.of( + FILESYSTEM_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + METADATA_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + DELETE_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + DELETE_FILE_FAILED_COUNTER), + 0L) + .put( + ImmutableList.of( + DELETE_FILES_TASK_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + DELETE_FILE_SUCCEEDED_COUNTER), + 0L) + .build()); + } + + private void appendDeleteOrphanFiles() { + appendDeleteOrphanFiles(DeleteOrphanFiles.builder().minAge(Duration.ZERO)); + } + + private void appendDeleteOrphanFiles(DeleteOrphanFiles.Builder builder) { + builder + .append( + infra.triggerStream(), + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + 0, + tableLoader(), + UID_SUFFIX, + StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, + 1) + .sinkTo(infra.sink()); + } + + private static void assertFileNum( + Table table, int expectedDataFileNum, int expectedDeleteFileNum) { + table.refresh(); + assertThat( + table.currentSnapshot().dataManifests(table.io()).stream() + .flatMap( + m -> + StreamSupport.stream( + ManifestFiles.read(m, table.io(), table.specs()).spliterator(), false)) + .count()) + .isEqualTo(expectedDataFileNum); + assertThat( + table.currentSnapshot().deleteManifests(table.io()).stream() + .flatMap( + m -> + StreamSupport.stream( + ManifestFiles.readDeleteManifest(m, table.io(), table.specs()) + .spliterator(), + false)) + .count()) + .isEqualTo(expectedDeleteFileNum); + } + + private static boolean checkDeleteFinished(String tableName, Long expectedDeleteNum) { + return expectedDeleteNum.equals( + MetricsReporterFactoryForTests.counter( + ImmutableList.of( + DELETE_FILES_TASK_NAME + "[0]", + tableName, + DUMMY_TASK_NAME, + "0", + DELETE_FILE_SUCCEEDED_COUNTER))); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java new file mode 100644 index 000000000000..b8aa259e2f17 --- /dev/null +++ 
b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.api; + +import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; +import static org.apache.iceberg.flink.maintenance.api.ExpireSnapshots.DELETE_FILES_OPERATOR_NAME; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.DELETE_FILE_FAILED_COUNTER; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.DELETE_FILE_SUCCEEDED_COUNTER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.time.Duration; +import java.util.List; +import java.util.Set; +import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.maintenance.operator.MetricsReporterFactoryForTests; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +class TestExpireSnapshots extends MaintenanceTaskTestBase { + private Table table; + + @BeforeEach + void before() { + MetricsReporterFactoryForTests.reset(); + this.table = createTable(); + tableLoader().open(); + } + + @Test + void testExpireSnapshots() throws Exception { + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + insert(table, 4, "d"); + + Set snapshots = Sets.newHashSet(table.snapshots()); + assertThat(snapshots).hasSize(4); + + ExpireSnapshots.builder() + .parallelism(1) + .planningWorkerPoolSize(2) + .deleteBatchSize(3) + .maxSnapshotAge(Duration.ZERO) + .retainLast(1) + .uidSuffix(UID_SUFFIX) + .append( + infra.triggerStream(), + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + 0, + tableLoader(), + "OTHER", + StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, + 1) + .sinkTo(infra.sink()); + + runAndWaitForSuccess( + infra.env(), infra.source(), infra.sink(), () -> checkDeleteFinished(table.name(), 3L)); + + table.refresh(); + assertThat(Sets.newHashSet(table.snapshots())).hasSize(1); + // Check that the table data not changed + SimpleDataUtil.assertTableRecords( + table, + ImmutableList.of( + createRecord(1, "a"), + createRecord(2, "b"), + createRecord(3, "c"), + createRecord(4, "d"))); + } + + @Test + void testFailure() throws Exception { + insert(table, 1, "a"); + insert(table, 2, "b"); + + ExpireSnapshots.builder() + .append( + infra.triggerStream(), + 
DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + 0, + tableLoader(), + UID_SUFFIX, + StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, + 1) + .sinkTo(infra.sink()); + + runAndWaitForFailure(infra.env(), infra.source(), infra.sink()); + + // Check the metrics. There are no expired snapshots or data files because ExpireSnapshots has + // no max age of number of snapshots set, so no files are removed. + MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put( + ImmutableList.of( + DELETE_FILES_OPERATOR_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + DELETE_FILE_FAILED_COUNTER), + 0L) + .put( + ImmutableList.of( + DELETE_FILES_OPERATOR_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + DELETE_FILE_SUCCEEDED_COUNTER), + 0L) + .build()); + } + + @Test + void testUidAndSlotSharingGroup() { + ExpireSnapshots.builder() + .slotSharingGroup(SLOT_SHARING_GROUP) + .uidSuffix(UID_SUFFIX) + .append( + infra.triggerStream(), + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + 0, + tableLoader(), + UID_SUFFIX, + StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, + 1) + .sinkTo(infra.sink()); + + checkUidsAreSet(infra.env(), UID_SUFFIX); + checkSlotSharingGroupsAreSet(infra.env(), SLOT_SHARING_GROUP); + } + + @Test + void testUidAndSlotSharingGroupUnset() { + ExpireSnapshots.builder() + .append( + infra.triggerStream(), + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + 0, + tableLoader(), + UID_SUFFIX, + StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, + 1) + .sinkTo(infra.sink()); + + checkUidsAreSet(infra.env(), null); + checkSlotSharingGroupsAreSet(infra.env(), null); + } + + @Test + void testMetrics() throws Exception { + insert(table, 1, "a"); + insert(table, 2, "b"); + + ExpireSnapshots.builder() + .maxSnapshotAge(Duration.ZERO) + .retainLast(1) + .parallelism(1) + .append( + infra.triggerStream(), + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + 0, + tableLoader(), + UID_SUFFIX, + StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, + 1) + .sinkTo(infra.sink()); + + runAndWaitForSuccess( + infra.env(), infra.source(), infra.sink(), () -> checkDeleteFinished(table.name(), 1L)); + + // Check the metrics + Awaitility.await() + .untilAsserted( + () -> + MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put( + ImmutableList.of( + DELETE_FILES_OPERATOR_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + DELETE_FILE_FAILED_COUNTER), + 0L) + .put( + ImmutableList.of( + DELETE_FILES_OPERATOR_NAME + "[0]", + table.name(), + DUMMY_TASK_NAME, + "0", + DELETE_FILE_SUCCEEDED_COUNTER), + 1L) + .build())); + } + + private static boolean checkDeleteFinished(String tableName, Long expectedDeleteNum) { + return expectedDeleteNum.equals( + MetricsReporterFactoryForTests.counter( + ImmutableList.of( + DELETE_FILES_OPERATOR_NAME + "[0]", + tableName, + DUMMY_TASK_NAME, + "0", + DELETE_FILE_SUCCEEDED_COUNTER))); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java new file mode 100644 index 000000000000..3cb18ffbb77e --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.api; + +import static org.apache.iceberg.flink.maintenance.api.JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY; + +import java.util.Map; +import java.util.UUID; +import org.apache.iceberg.jdbc.JdbcCatalog; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +class TestJdbcLockFactory extends TestLockFactoryBase { + @Override + TriggerLockFactory lockFactory(String tableName) { + Map properties = Maps.newHashMap(); + properties.put(JdbcCatalog.PROPERTY_PREFIX + "username", "user"); + properties.put(JdbcCatalog.PROPERTY_PREFIX + "password", "password"); + properties.put(INIT_LOCK_TABLES_PROPERTY, "true"); + + return new JdbcLockFactory( + "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""), + tableName, + properties); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java new file mode 100644 index 000000000000..8a1b286ef591 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +abstract class TestLockFactoryBase { + protected TriggerLockFactory lockFactory; + + abstract TriggerLockFactory lockFactory(String tableName); + + @BeforeEach + void before() { + this.lockFactory = lockFactory("tableName"); + lockFactory.open(); + } + + @AfterEach + void after() throws IOException { + lockFactory.close(); + } + + @Test + void testTryLock() { + TriggerLockFactory.Lock lock1 = lockFactory.createLock(); + TriggerLockFactory.Lock lock2 = lockFactory.createLock(); + assertThat(lock1.tryLock()).isTrue(); + assertThat(lock1.tryLock()).isFalse(); + assertThat(lock2.tryLock()).isFalse(); + } + + @Test + void testUnLock() { + TriggerLockFactory.Lock lock = lockFactory.createLock(); + assertThat(lock.tryLock()).isTrue(); + + lock.unlock(); + assertThat(lock.tryLock()).isTrue(); + } + + @Test + void testNoConflictWithRecoveryLock() { + TriggerLockFactory.Lock lock1 = lockFactory.createLock(); + TriggerLockFactory.Lock lock2 = lockFactory.createRecoveryLock(); + assertThat(lock1.tryLock()).isTrue(); + assertThat(lock2.tryLock()).isTrue(); + } + + @Test + void testDoubleUnLock() { + TriggerLockFactory.Lock lock = lockFactory.createLock(); + assertThat(lock.tryLock()).isTrue(); + + lock.unlock(); + lock.unlock(); + assertThat(lock.tryLock()).isTrue(); + assertThat(lock.tryLock()).isFalse(); + } + + @Test + void testMultiTableLock() throws IOException { + TriggerLockFactory other = lockFactory("tableName2"); + other.open(); + TriggerLockFactory.Lock lock1 = lockFactory.createLock(); + TriggerLockFactory.Lock lock2 = other.createLock(); + assertThat(lock1.tryLock()).isTrue(); + assertThat(lock2.tryLock()).isTrue(); + lock1.unlock(); + lock2.unlock(); + other.close(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java new file mode 100644 index 000000000000..0a860fec4799 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.time.Duration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.maintenance.operator.OperatorTestBase; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +class TestMaintenanceE2E extends OperatorTestBase { + private StreamExecutionEnvironment env; + + @BeforeEach + public void beforeEach() throws IOException { + this.env = StreamExecutionEnvironment.getExecutionEnvironment(); + Table table = createTable(); + insert(table, 1, "a"); + } + + @Test + void testE2e() throws Exception { + TableMaintenance.forTable(env, tableLoader(), LOCK_FACTORY) + .uidSuffix("E2eTestUID") + .rateLimit(Duration.ofMinutes(10)) + .lockCheckDelay(Duration.ofSeconds(10)) + .add( + ExpireSnapshots.builder() + .scheduleOnCommitCount(10) + .maxSnapshotAge(Duration.ofMinutes(10)) + .retainLast(5) + .deleteBatchSize(5) + .parallelism(8)) + .add( + RewriteDataFiles.builder() + .scheduleOnDataFileCount(10) + .partialProgressEnabled(true) + .partialProgressMaxCommits(10) + .maxRewriteBytes(1000L) + .targetFileSizeBytes(1000L) + .minFileSizeBytes(1000L) + .maxFileSizeBytes(1000L) + .minInputFiles(10) + .deleteFileThreshold(10) + .rewriteAll(false) + .maxFileGroupSizeBytes(1000L)) + .append(); + + JobClient jobClient = null; + try { + jobClient = env.executeAsync(); + + // Just make sure that we are able to instantiate the flow + assertThat(jobClient).isNotNull(); + } finally { + closeJobClient(jobClient); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java new file mode 100644 index 000000000000..795057e23538 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java @@ -0,0 +1,457 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; +import static org.apache.iceberg.flink.maintenance.api.RewriteDataFiles.COMMIT_TASK_NAME; +import static org.apache.iceberg.flink.maintenance.api.RewriteDataFiles.PLANNER_TASK_NAME; +import static org.apache.iceberg.flink.maintenance.api.RewriteDataFiles.REWRITE_TASK_NAME; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.ADDED_DATA_FILE_NUM_METRIC; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.ADDED_DATA_FILE_SIZE_METRIC; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.ERROR_COUNTER; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.REMOVED_DATA_FILE_NUM_METRIC; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.REMOVED_DATA_FILE_SIZE_METRIC; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.stream.StreamSupport; +import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.Table; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.maintenance.operator.MetricsReporterFactoryForTests; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.Test; + +class TestRewriteDataFiles extends MaintenanceTaskTestBase { + @Test + void testRewriteUnpartitioned() throws Exception { + Table table = createTable(); + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + insert(table, 4, "d"); + + assertFileNum(table, 4, 0); + + appendRewriteDataFiles( + RewriteDataFiles.builder() + .parallelism(2) + .deleteFileThreshold(10) + .targetFileSizeBytes(1_000_000L) + .maxFileGroupSizeBytes(10_000_000L) + .maxFileSizeBytes(2_000_000L) + .minFileSizeBytes(500_000L) + .minInputFiles(2) + .partialProgressEnabled(true) + .partialProgressMaxCommits(1) + .maxRewriteBytes(100_000L) + .rewriteAll(false)); + + runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); + + assertFileNum(table, 1, 0); + + SimpleDataUtil.assertTableRecords( + table, + ImmutableList.of( + createRecord(1, "a"), + createRecord(2, "b"), + createRecord(3, "c"), + createRecord(4, "d"))); + } + + @Test + void testRewritePartitioned() throws Exception { + Table table = createPartitionedTable(); + insertPartitioned(table, 1, "p1"); + insertPartitioned(table, 2, "p1"); + insertPartitioned(table, 3, "p2"); + insertPartitioned(table, 4, "p2"); + + assertFileNum(table, 4, 0); + + appendRewriteDataFiles(); + + runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); + + assertFileNum(table, 2, 0); + + SimpleDataUtil.assertTableRecords( + table, + ImmutableList.of( + createRecord(1, "p1"), + createRecord(2, "p1"), + createRecord(3, "p2"), + createRecord(4, "p2"))); + } + + @Test + void testPlannerFailure() throws Exception { + Table table = createTable(); + insert(table, 1, "a"); + insert(table, 2, "b"); + + assertFileNum(table, 2, 0); + + appendRewriteDataFiles(); + + runAndWaitForFailure(infra.env(), infra.source(), infra.sink()); + + // Check the metrics. The first task should be successful, but the second one should fail. This + // should be represented in the counters. 
+ MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put( + ImmutableList.of( + PLANNER_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 1L) + .put( + ImmutableList.of( + REWRITE_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ADDED_DATA_FILE_NUM_METRIC), + 1L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ADDED_DATA_FILE_SIZE_METRIC), + -1L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + REMOVED_DATA_FILE_NUM_METRIC), + 2L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + REMOVED_DATA_FILE_SIZE_METRIC), + -1L) + .build()); + } + + @Test + void testUidAndSlotSharingGroup() { + createTable(); + + RewriteDataFiles.builder() + .slotSharingGroup(SLOT_SHARING_GROUP) + .uidSuffix(UID_SUFFIX) + .append( + infra.triggerStream(), + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + 0, + tableLoader(), + "OTHER", + "OTHER", + 1) + .sinkTo(infra.sink()); + + checkUidsAreSet(infra.env(), UID_SUFFIX); + checkSlotSharingGroupsAreSet(infra.env(), SLOT_SHARING_GROUP); + } + + @Test + void testUidAndSlotSharingGroupUnset() { + createTable(); + + RewriteDataFiles.builder() + .append( + infra.triggerStream(), + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + 0, + tableLoader(), + UID_SUFFIX, + StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, + 1) + .sinkTo(infra.sink()); + + checkUidsAreSet(infra.env(), null); + checkSlotSharingGroupsAreSet(infra.env(), null); + } + + @Test + void testMetrics() throws Exception { + Table table = createTable(); + insert(table, 1, "a"); + insert(table, 2, "b"); + + assertFileNum(table, 2, 0); + + appendRewriteDataFiles(); + + runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); + + // Check the metrics + MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put( + ImmutableList.of( + PLANNER_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + REWRITE_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ADDED_DATA_FILE_NUM_METRIC), + 1L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ADDED_DATA_FILE_SIZE_METRIC), + -1L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + REMOVED_DATA_FILE_NUM_METRIC), + 2L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + REMOVED_DATA_FILE_SIZE_METRIC), + -1L) + .build()); + } + + @Test + void testV2Table() throws Exception { + Table table = createTableWithDelete(); + update(table, 1, null, "a", "b"); + update(table, 1, "b", "c"); + + assertFileNum(table, 2, 3); + SimpleDataUtil.assertTableRecords(table, ImmutableList.of(createRecord(1, "c"))); + + appendRewriteDataFiles(); + + runAndWaitForSuccess(infra.env(), infra.source(), 
infra.sink()); + + assertFileNum(table, 1, 1); + + SimpleDataUtil.assertTableRecords(table, ImmutableList.of(createRecord(1, "c"))); + + // Check the metrics + MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put( + ImmutableList.of( + PLANNER_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + REWRITE_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ERROR_COUNTER), + 0L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ADDED_DATA_FILE_NUM_METRIC), + 1L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + ADDED_DATA_FILE_SIZE_METRIC), + -1L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + REMOVED_DATA_FILE_NUM_METRIC), + 2L) + .put( + ImmutableList.of( + COMMIT_TASK_NAME + "[0]", + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + "0", + REMOVED_DATA_FILE_SIZE_METRIC), + -1L) + .build()); + } + + @Test + void testRewriteWithFilter() throws Exception { + Table table = createTable(); + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + insert(table, 4, "d"); + + assertFileNum(table, 4, 0); + + appendRewriteDataFiles( + RewriteDataFiles.builder() + .parallelism(2) + .deleteFileThreshold(10) + .targetFileSizeBytes(1_000_000L) + .maxFileGroupSizeBytes(10_000_000L) + .maxFileSizeBytes(2_000_000L) + .minFileSizeBytes(500_000L) + .minInputFiles(2) + // Only rewrite data files where id is 1 or 2 for testing rewrite + .filter(Expressions.in("id", 1, 2)) + .partialProgressEnabled(true) + .partialProgressMaxCommits(1) + .maxRewriteBytes(100_000L) + .rewriteAll(false)); + + runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); + + // There is four files, only id is 1 and 2 will be rewritten. so expect 3 files. 
+ assertFileNum(table, 3, 0); + + SimpleDataUtil.assertTableRecords( + table, + ImmutableList.of( + createRecord(1, "a"), + createRecord(2, "b"), + createRecord(3, "c"), + createRecord(4, "d"))); + } + + private void appendRewriteDataFiles() { + appendRewriteDataFiles(RewriteDataFiles.builder().rewriteAll(true)); + } + + private void appendRewriteDataFiles(RewriteDataFiles.Builder builder) { + builder + .append( + infra.triggerStream(), + DUMMY_TABLE_NAME, + DUMMY_TASK_NAME, + 0, + tableLoader(), + UID_SUFFIX, + StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, + 1) + .sinkTo(infra.sink()); + } + + private static void assertFileNum( + Table table, int expectedDataFileNum, int expectedDeleteFileNum) { + table.refresh(); + assertThat( + table.currentSnapshot().dataManifests(table.io()).stream() + .flatMap( + m -> + StreamSupport.stream( + ManifestFiles.read(m, table.io(), table.specs()).spliterator(), false)) + .count()) + .isEqualTo(expectedDataFileNum); + assertThat( + table.currentSnapshot().deleteManifests(table.io()).stream() + .flatMap( + m -> + StreamSupport.stream( + ManifestFiles.readDeleteManifest(m, table.io(), table.specs()) + .spliterator(), + false)) + .count()) + .isEqualTo(expectedDeleteFileNum); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java new file mode 100644 index 000000000000..665a82ea15bb --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.time.Duration; +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.maintenance.operator.OperatorTestBase; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestRewriteDataFilesConfig extends OperatorTestBase { + private Table table; + private Map input = Maps.newHashMap(); + + @BeforeEach + public void before() { + this.table = createTable(); + input.put( + RewriteDataFilesConfig.PREFIX + + org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, + "true"); + input.put( + RewriteDataFilesConfig.PREFIX + + org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, + "5"); + input.put(RewriteDataFilesConfig.MAX_BYTES, "1024"); + input.put(RewriteDataFilesConfig.SCHEDULE_ON_COMMIT_COUNT, "10"); + input.put(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_COUNT, "20"); + input.put(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_SIZE, "30"); + input.put(RewriteDataFilesConfig.SCHEDULE_ON_INTERVAL_SECOND, "60"); + input.put("other.config", "should-be-ignored"); + } + + @AfterEach + public void after() { + input.clear(); + } + + @Test + void testConfigParsing() { + RewriteDataFilesConfig config = new RewriteDataFilesConfig(table, input, new Configuration()); + + assertThat(config.partialProgressEnable()).isTrue(); + assertThat(config.partialProgressMaxCommits()).isEqualTo(5); + assertThat(config.maxRewriteBytes()).isEqualTo(1024L); + assertThat(config.scheduleOnCommitCount()).isEqualTo(10); + assertThat(config.scheduleOnDataFileCount()).isEqualTo(20); + assertThat(config.scheduleOnDataFileSize()).isEqualTo(30); + assertThat(config.scheduleOnIntervalSecond()).isEqualTo(60); + } + + @Test + void testEmptyConfig() { + RewriteDataFilesConfig config = + new RewriteDataFilesConfig(table, Maps.newHashMap(), new Configuration()); + + assertThat(config.partialProgressEnable()) + .isEqualTo(org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_ENABLED_DEFAULT); + assertThat(config.partialProgressMaxCommits()) + .isEqualTo( + org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT); + assertThat(config.maxRewriteBytes()).isEqualTo(Long.MAX_VALUE); + assertThat(config.scheduleOnCommitCount()) + .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_COMMIT_COUNT_OPTION.defaultValue()); + assertThat(config.scheduleOnDataFileCount()) + .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_COUNT_OPTION.defaultValue()); + assertThat(config.scheduleOnDataFileSize()) + .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_SIZE_OPTION.defaultValue()); + assertThat(config.scheduleOnIntervalSecond()) + .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_INTERVAL_SECOND_OPTION.defaultValue()); + } + + @Test + void testPropertiesMethodWithAllConfigs() { + RewriteDataFilesConfig config = new RewriteDataFilesConfig(table, input, new Configuration()); + + // check the config about the rewriter + assertThat(config.partialProgressEnable()).isTrue(); + assertThat(config.partialProgressMaxCommits()).isEqualTo(5); + assertThat(config.maxRewriteBytes()).isEqualTo(1024L); + + // check the config about the schedule + assertThat(config.scheduleOnCommitCount()).isEqualTo(10); + 
assertThat(config.scheduleOnDataFileCount()).isEqualTo(20); + assertThat(config.scheduleOnDataFileSize()).isEqualTo(30); + assertThat(config.scheduleOnIntervalSecond()).isEqualTo(Duration.ofSeconds(60).toSeconds()); + + assertThat(config.properties()) + .doesNotContainKey("custom.option") + .containsEntry("partial-progress.enabled", "true") + .containsEntry("partial-progress.max-commits", "5") + .containsEntry("max-bytes", "1024") + .containsEntry("schedule.commit-count", "10") + .containsEntry("schedule.data-file-count", "20") + .containsEntry("schedule.data-file-size", "30") + .containsEntry("schedule.interval-second", "60"); + } + + @Test + void testPropertiesWithDefaultConfig() { + RewriteDataFilesConfig config = + new RewriteDataFilesConfig(table, Maps.newHashMap(), new Configuration()); + + // check the config about the rewriter + assertThat(config.partialProgressEnable()).isFalse(); + assertThat(config.partialProgressMaxCommits()) + .isEqualTo(RewriteDataFilesConfig.PARTIAL_PROGRESS_MAX_COMMITS_OPTION.defaultValue()); + assertThat(config.maxRewriteBytes()).isEqualTo(Long.MAX_VALUE); + + // check the config about the schedule + assertThat(config.scheduleOnCommitCount()) + .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_COMMIT_COUNT_OPTION.defaultValue()); + assertThat(config.scheduleOnDataFileCount()) + .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_COUNT_OPTION.defaultValue()); + assertThat(config.scheduleOnDataFileSize()) + .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_SIZE_OPTION.defaultValue()); + assertThat(config.scheduleOnIntervalSecond()) + .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_INTERVAL_SECOND_OPTION.defaultValue()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java new file mode 100644 index 000000000000..eaa5b5e1b5b1 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java @@ -0,0 +1,462 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import static org.apache.iceberg.flink.SimpleDataUtil.createRowData; +import static org.apache.iceberg.flink.maintenance.api.TableMaintenance.LOCK_REMOVER_OPERATOR_NAME; +import static org.apache.iceberg.flink.maintenance.api.TableMaintenance.SOURCE_OPERATOR_NAME_PREFIX; +import static org.apache.iceberg.flink.maintenance.api.TableMaintenance.TRIGGER_MANAGER_OPERATOR_NAME; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.CONCURRENT_RUN_THROTTLED; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.FAILED_TASK_COUNTER; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.NOTHING_TO_TRIGGER; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.RATE_LIMITER_TRIGGERED; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.SUCCEEDED_TASK_COUNTER; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.TRIGGERED; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.time.Duration; +import java.util.Collections; +import java.util.List; +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.dag.Transformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.configuration.CheckpointingOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.transformations.SourceTransformation; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.operator.ManualSource; +import org.apache.iceberg.flink.maintenance.operator.MetricsReporterFactoryForTests; +import org.apache.iceberg.flink.maintenance.operator.OperatorTestBase; +import org.apache.iceberg.flink.maintenance.operator.TableChange; +import org.apache.iceberg.flink.sink.FlinkSink; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +class TestTableMaintenance extends OperatorTestBase { + private static final String MAINTENANCE_TASK_NAME = "TestTableMaintenance"; + private static final String[] TASKS = + new String[] {MAINTENANCE_TASK_NAME + " [0]", MAINTENANCE_TASK_NAME + " [1]"}; + private static final TableChange DUMMY_CHANGE = TableChange.builder().commitCount(1).build(); + private static final List PROCESSED = + Collections.synchronizedList(Lists.newArrayListWithCapacity(1)); + + private StreamExecutionEnvironment env; + private Table table; + + @TempDir private File checkpointDir; + + @BeforeEach + public void beforeEach() throws 
IOException { + Configuration config = new Configuration(); + config.set(CheckpointingOptions.CHECKPOINT_STORAGE, "filesystem"); + config.set(CheckpointingOptions.CHECKPOINTS_DIRECTORY, "file://" + checkpointDir.getPath()); + this.env = StreamExecutionEnvironment.getExecutionEnvironment(config); + this.table = createTable(); + insert(table, 1, "a"); + + PROCESSED.clear(); + MaintenanceTaskBuilderForTest.counter = 0; + } + + @Test + void testForChangeStream() throws Exception { + ManualSource schedulerSource = + new ManualSource<>(env, TypeInformation.of(TableChange.class)); + + TableMaintenance.Builder streamBuilder = + TableMaintenance.forChangeStream(schedulerSource.dataStream(), tableLoader(), LOCK_FACTORY) + .rateLimit(Duration.ofMillis(2)) + .lockCheckDelay(Duration.ofSeconds(3)) + .add( + new MaintenanceTaskBuilderForTest(true) + .scheduleOnCommitCount(1) + .scheduleOnDataFileCount(2) + .scheduleOnDataFileSize(3L) + .scheduleOnEqDeleteFileCount(4) + .scheduleOnEqDeleteRecordCount(5L) + .scheduleOnPosDeleteFileCount(6) + .scheduleOnPosDeleteRecordCount(7L) + .scheduleOnInterval(Duration.ofHours(1))); + + sendEvents(schedulerSource, streamBuilder, ImmutableList.of(Tuple2.of(DUMMY_CHANGE, 1))); + } + + @Test + void testForTable() throws Exception { + TableLoader tableLoader = tableLoader(); + + env.enableCheckpointing(10); + + TableMaintenance.forTable(env, tableLoader, LOCK_FACTORY) + .rateLimit(Duration.ofMillis(2)) + .maxReadBack(2) + .add(new MaintenanceTaskBuilderForTest(true).scheduleOnCommitCount(2)) + .append(); + + // Creating a stream for inserting data into the table concurrently + ManualSource insertSource = + new ManualSource<>(env, InternalTypeInfo.of(FlinkSchemaUtil.convert(table.schema()))); + FlinkSink.forRowData(insertSource.dataStream()) + .tableLoader(tableLoader) + .uidPrefix(UID_SUFFIX + "-iceberg-sink") + .append(); + + JobClient jobClient = null; + try { + jobClient = env.executeAsync(); + + insertSource.sendRecord(createRowData(2, "b")); + + Awaitility.await().until(() -> PROCESSED.size() == 1); + } finally { + closeJobClient(jobClient); + } + } + + @Test + void testLocking() throws Exception { + TriggerLockFactory.Lock lock = LOCK_FACTORY.createLock(); + + ManualSource schedulerSource = + new ManualSource<>(env, TypeInformation.of(TableChange.class)); + + TableMaintenance.Builder streamBuilder = + TableMaintenance.forChangeStream(schedulerSource.dataStream(), tableLoader(), LOCK_FACTORY) + .rateLimit(Duration.ofMillis(2)) + .add(new MaintenanceTaskBuilderForTest(true).scheduleOnCommitCount(1)); + + assertThat(lock.isHeld()).isFalse(); + sendEvents(schedulerSource, streamBuilder, ImmutableList.of(Tuple2.of(DUMMY_CHANGE, 1))); + + assertThat(lock.isHeld()).isFalse(); + } + + @Test + void testMetrics() throws Exception { + ManualSource schedulerSource = + new ManualSource<>(env, TypeInformation.of(TableChange.class)); + + TableMaintenance.Builder streamBuilder = + TableMaintenance.forChangeStream(schedulerSource.dataStream(), tableLoader(), LOCK_FACTORY) + .rateLimit(Duration.ofMillis(2)) + .lockCheckDelay(Duration.ofMillis(2)) + .add(new MaintenanceTaskBuilderForTest(true).scheduleOnCommitCount(1)) + .add(new MaintenanceTaskBuilderForTest(false).scheduleOnCommitCount(2)); + + sendEvents( + schedulerSource, + streamBuilder, + ImmutableList.of(Tuple2.of(DUMMY_CHANGE, 1), Tuple2.of(DUMMY_CHANGE, 2))); + + Awaitility.await() + .until( + () -> + MetricsReporterFactoryForTests.counter( + ImmutableList.of( + LOCK_REMOVER_OPERATOR_NAME, + table.name(), + TASKS[0], + "0", 
+ SUCCEEDED_TASK_COUNTER)) + .equals(2L)); + + MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put( + ImmutableList.of( + LOCK_REMOVER_OPERATOR_NAME, + table.name(), + TASKS[0], + "0", + SUCCEEDED_TASK_COUNTER), + 2L) + .put( + ImmutableList.of( + LOCK_REMOVER_OPERATOR_NAME, table.name(), TASKS[0], "0", FAILED_TASK_COUNTER), + 0L) + .put( + ImmutableList.of( + TRIGGER_MANAGER_OPERATOR_NAME, table.name(), TASKS[0], "0", TRIGGERED), + 2L) + .put( + ImmutableList.of( + LOCK_REMOVER_OPERATOR_NAME, + table.name(), + TASKS[1], + "1", + SUCCEEDED_TASK_COUNTER), + 0L) + .put( + ImmutableList.of( + LOCK_REMOVER_OPERATOR_NAME, table.name(), TASKS[1], "1", FAILED_TASK_COUNTER), + 1L) + .put( + ImmutableList.of( + TRIGGER_MANAGER_OPERATOR_NAME, table.name(), TASKS[1], "1", TRIGGERED), + 1L) + .put( + ImmutableList.of(TRIGGER_MANAGER_OPERATOR_NAME, table.name(), NOTHING_TO_TRIGGER), + -1L) + .put( + ImmutableList.of( + TRIGGER_MANAGER_OPERATOR_NAME, table.name(), CONCURRENT_RUN_THROTTLED), + -1L) + .put( + ImmutableList.of( + TRIGGER_MANAGER_OPERATOR_NAME, table.name(), RATE_LIMITER_TRIGGERED), + -1L) + .build()); + } + + @Test + void testUidAndSlotSharingGroup() throws IOException { + TableMaintenance.forChangeStream( + new ManualSource<>(env, TypeInformation.of(TableChange.class)).dataStream(), + tableLoader(), + LOCK_FACTORY) + .uidSuffix(UID_SUFFIX) + .slotSharingGroup(SLOT_SHARING_GROUP) + .add( + new MaintenanceTaskBuilderForTest(true) + .scheduleOnCommitCount(1) + .uidSuffix(UID_SUFFIX) + .slotSharingGroup(SLOT_SHARING_GROUP)) + .append(); + + checkUidsAreSet(env, UID_SUFFIX); + checkSlotSharingGroupsAreSet(env, SLOT_SHARING_GROUP); + } + + @Test + void testUidAndSlotSharingGroupUnset() throws IOException { + TableMaintenance.forChangeStream( + new ManualSource<>(env, TypeInformation.of(TableChange.class)).dataStream(), + tableLoader(), + LOCK_FACTORY) + .add(new MaintenanceTaskBuilderForTest(true).scheduleOnCommitCount(1)) + .append(); + + checkUidsAreSet(env, null); + checkSlotSharingGroupsAreSet(env, null); + } + + @Test + void testUidAndSlotSharingGroupInherit() throws IOException { + TableMaintenance.forChangeStream( + new ManualSource<>(env, TypeInformation.of(TableChange.class)).dataStream(), + tableLoader(), + LOCK_FACTORY) + .uidSuffix(UID_SUFFIX) + .slotSharingGroup(SLOT_SHARING_GROUP) + .add(new MaintenanceTaskBuilderForTest(true).scheduleOnCommitCount(1)) + .append(); + + checkUidsAreSet(env, UID_SUFFIX); + checkSlotSharingGroupsAreSet(env, SLOT_SHARING_GROUP); + } + + @Test + void testUidAndSlotSharingGroupOverWrite() throws IOException { + String anotherUid = "Another-UID"; + String anotherSlotSharingGroup = "Another-SlotSharingGroup"; + TableMaintenance.forChangeStream( + new ManualSource<>(env, TypeInformation.of(TableChange.class)).dataStream(), + tableLoader(), + LOCK_FACTORY) + .uidSuffix(UID_SUFFIX) + .slotSharingGroup(SLOT_SHARING_GROUP) + .add( + new MaintenanceTaskBuilderForTest(true) + .scheduleOnCommitCount(1) + .uidSuffix(anotherUid) + .slotSharingGroup(anotherSlotSharingGroup)) + .append(); + + // Choose an operator from the scheduler part of the graph + Transformation schedulerTransformation = + env.getTransformations().stream() + .filter(t -> t.getName().equals("Trigger manager")) + .findFirst() + .orElseThrow(); + assertThat(schedulerTransformation.getUid()).contains(UID_SUFFIX); + assertThat(schedulerTransformation.getSlotSharingGroup()).isPresent(); + assertThat(schedulerTransformation.getSlotSharingGroup().get().getName()) + 
.isEqualTo(SLOT_SHARING_GROUP); + + // Choose an operator from the maintenance task part of the graph + Transformation scheduledTransformation = + env.getTransformations().stream() + .filter(t -> t.getName().startsWith(MAINTENANCE_TASK_NAME)) + .findFirst() + .orElseThrow(); + assertThat(scheduledTransformation.getUid()).contains(anotherUid); + assertThat(scheduledTransformation.getSlotSharingGroup()).isPresent(); + assertThat(scheduledTransformation.getSlotSharingGroup().get().getName()) + .isEqualTo(anotherSlotSharingGroup); + } + + @Test + void testUidAndSlotSharingGroupForMonitorSource() throws IOException { + TableMaintenance.forTable(env, tableLoader(), LOCK_FACTORY) + .uidSuffix(UID_SUFFIX) + .slotSharingGroup(SLOT_SHARING_GROUP) + .add( + new MaintenanceTaskBuilderForTest(true) + .scheduleOnCommitCount(1) + .uidSuffix(UID_SUFFIX) + .slotSharingGroup(SLOT_SHARING_GROUP)) + .append(); + + Transformation source = monitorSource(); + assertThat(source).isNotNull(); + assertThat(source.getUid()).contains(UID_SUFFIX); + assertThat(source.getSlotSharingGroup()).isPresent(); + assertThat(source.getSlotSharingGroup().get().getName()).isEqualTo(SLOT_SHARING_GROUP); + + checkUidsAreSet(env, UID_SUFFIX); + checkSlotSharingGroupsAreSet(env, SLOT_SHARING_GROUP); + } + + /** + * Sends the events though the {@link ManualSource} provided, and waits until the given number of + * records are processed. + * + * @param schedulerSource used for sending the events + * @param streamBuilder used for generating the job + * @param eventsAndResultNumbers the pair of the event and the expected processed records + * @throws Exception if any + */ + private void sendEvents( + ManualSource schedulerSource, + TableMaintenance.Builder streamBuilder, + List> eventsAndResultNumbers) + throws Exception { + streamBuilder.append(); + + JobClient jobClient = null; + try { + jobClient = env.executeAsync(); + + eventsAndResultNumbers.forEach( + eventsAndResultNumber -> { + int expectedSize = PROCESSED.size() + eventsAndResultNumber.f1; + schedulerSource.sendRecord(eventsAndResultNumber.f0); + Awaitility.await() + .until( + () -> PROCESSED.size() == expectedSize && !LOCK_FACTORY.createLock().isHeld()); + }); + } finally { + closeJobClient(jobClient); + } + } + + /** + * Finds the {@link org.apache.iceberg.flink.maintenance.operator.MonitorSource} for testing + * purposes by parsing the transformation tree. 
+ * + * @return The monitor source if we found it + */ + private Transformation monitorSource() { + assertThat(env.getTransformations()).isNotEmpty(); + assertThat(env.getTransformations().get(0).getInputs()).isNotEmpty(); + assertThat(env.getTransformations().get(0).getInputs().get(0).getInputs()).isNotEmpty(); + + Transformation result = + env.getTransformations().get(0).getInputs().get(0).getInputs().get(0); + + // Some checks to make sure this is the transformation we are looking for + assertThat(result).isInstanceOf(SourceTransformation.class); + assertThat(result.getName()).startsWith(SOURCE_OPERATOR_NAME_PREFIX); + + return result; + } + + private static class MaintenanceTaskBuilderForTest + extends MaintenanceTaskBuilder { + private final boolean success; + private final int id; + private static int counter = 0; + + MaintenanceTaskBuilderForTest(boolean success) { + this.success = success; + this.id = counter; + ++counter; + } + + @Override + String maintenanceTaskName() { + return MAINTENANCE_TASK_NAME; + } + + @Override + DataStream append(DataStream trigger) { + String name = TASKS[id]; + return trigger + .map(new DummyMaintenanceTask(success)) + .name(name) + .uid(uidSuffix() + "-test-mapper-" + name + "-" + id) + .slotSharingGroup(slotSharingGroup()) + .forceNonParallel(); + } + } + + private static class DummyMaintenanceTask + implements MapFunction, ResultTypeQueryable, Serializable { + private final boolean success; + + private DummyMaintenanceTask(boolean success) { + this.success = success; + } + + @Override + public TaskResult map(Trigger trigger) { + // Ensure that the lock is held when processing + assertThat(LOCK_FACTORY.createLock().isHeld()).isTrue(); + PROCESSED.add(trigger); + + return new TaskResult( + trigger.taskId(), + trigger.timestamp(), + success, + success ? Collections.emptyList() : Lists.newArrayList(new Exception("Testing error"))); + } + + @Override + public TypeInformation getProducedType() { + return TypeInformation.of(TaskResult.class); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java new file mode 100644 index 000000000000..f1313c89ae53 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.api; + +import java.io.IOException; +import org.apache.curator.test.TestingServer; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; + +public class TestZkLockFactory extends TestLockFactoryBase { + + private TestingServer zkTestServer; + + @Override + TriggerLockFactory lockFactory(String tableName) { + return new ZkLockFactory(zkTestServer.getConnectString(), tableName, 5000, 3000, 1000, 3); + } + + @BeforeEach + @Override + void before() { + try { + zkTestServer = new TestingServer(); + } catch (Exception e) { + throw new RuntimeException(e); + } + + super.before(); + } + + @AfterEach + public void after() throws IOException { + super.after(); + if (zkTestServer != null) { + zkTestServer.close(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java new file mode 100644 index 000000000000..e09e312be1dd --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.time.Duration; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.sink2.SinkWriter; +import org.apache.flink.api.connector.sink2.WriterInitContext; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** Sink for collecting output during testing. */ +public class CollectingSink implements Sink { + private static final long serialVersionUID = 1L; + private static final List> QUEUES = + Collections.synchronizedList(Lists.newArrayListWithExpectedSize(1)); + private static final AtomicInteger NUM_SINKS = new AtomicInteger(-1); + private final int index; + + /** Creates a new sink which collects the elements received. */ + public CollectingSink() { + this.index = NUM_SINKS.incrementAndGet(); + QUEUES.add(new LinkedBlockingQueue<>()); + } + + /** + * Gets all the remaining output received by this {@link Sink}. + * + * @return all the remaining output + */ + List remainingOutput() { + return Lists.newArrayList((BlockingQueue) QUEUES.get(this.index)); + } + + /** + * Check if there is no remaining output received by this {@link Sink}. 
+ * + * @return true if there is no remaining output + */ + boolean isEmpty() { + return QUEUES.get(this.index).isEmpty(); + } + + /** + * Wait until the next element received by the {@link Sink}. + * + * @param timeout for the poll + * @return The first element received by this {@link Sink} + * @throws TimeoutException if no element received until the timeout + */ + public T poll(Duration timeout) throws TimeoutException { + Object element; + + try { + element = QUEUES.get(this.index).poll(timeout.toMillis(), TimeUnit.MILLISECONDS); + } catch (InterruptedException var4) { + throw new RuntimeException(var4); + } + + if (element == null) { + throw new TimeoutException(); + } else { + return (T) element; + } + } + + @Override + public SinkWriter createWriter(WriterInitContext context) { + return new CollectingWriter<>(index); + } + + private static class CollectingWriter implements SinkWriter { + private final int index; + + CollectingWriter(int index) { + this.index = index; + } + + @Override + public void write(T element, Context context) { + QUEUES.get(index).add(element); + } + + @Override + public void flush(boolean endOfInput) { + // Nothing to do here + } + + @Override + public void close() { + // Nothing to do here + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java new file mode 100644 index 000000000000..eff32fcfa118 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java @@ -0,0 +1,320 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.ArrayDeque; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.api.connector.source.ReaderOutput; +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.connector.source.SourceSplit; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.core.io.InputStatus; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Queues; +import org.jetbrains.annotations.Nullable; + +/** Testing source implementation for Flink sources which can be triggered manually. */ +public class ManualSource + implements Source, + ResultTypeQueryable { + + private static final long serialVersionUID = 1L; + private static final List>> QUEUES = + Collections.synchronizedList(Lists.newArrayList()); + private static final List> AVAILABILITIES = + Collections.synchronizedList(Lists.newArrayList()); + private static int numSources = 0; + private final TypeInformation type; + private final int index; + private transient DataStream stream; + private final transient StreamExecutionEnvironment env; + + /** + * Creates a new source for testing. + * + * @param env to register the source + * @param type of the events returned by the source + */ + public ManualSource(StreamExecutionEnvironment env, TypeInformation type) { + this.type = type; + this.env = env; + this.index = numSources++; + QUEUES.add(Queues.newArrayDeque()); + AVAILABILITIES.add(new CompletableFuture<>()); + } + + /** + * Emit a new record from the source. + * + * @param event to emit + */ + public void sendRecord(T event) { + this.sendInternal(Tuple2.of(event, null)); + } + + /** + * Emit a new record with the given event time from the source. + * + * @param event to emit + * @param eventTime of the event + */ + public void sendRecord(T event, long eventTime) { + this.sendInternal(Tuple2.of(event, eventTime)); + } + + /** + * Emit a watermark from the source. + * + * @param timeStamp of the watermark + */ + public void sendWatermark(long timeStamp) { + this.sendInternal(Tuple2.of(null, timeStamp)); + } + + /** Mark the source as finished. */ + void markFinished() { + this.sendWatermark(Long.MAX_VALUE); + this.sendInternal(Tuple2.of(null, null)); + } + + /** + * Get the {@link DataStream} for this source. 
+ * + * @return the stream emitted by this source + */ + public DataStream dataStream() { + if (this.stream == null) { + this.stream = + this.env + .fromSource(this, WatermarkStrategy.noWatermarks(), "ManualSource-" + index, type) + .forceNonParallel(); + } + + return this.stream; + } + + private void sendInternal(Tuple2 tuple) { + QUEUES.get(index).offer(tuple); + AVAILABILITIES.get(index).complete(null); + } + + @Override + public Boundedness getBoundedness() { + return Boundedness.CONTINUOUS_UNBOUNDED; + } + + @Override + public SplitEnumerator createEnumerator( + SplitEnumeratorContext enumContext) { + return new DummyCheckpointEnumerator(); + } + + @Override + public SplitEnumerator restoreEnumerator( + SplitEnumeratorContext enumContext, DummyCheckpoint checkpoint) { + return new DummyCheckpointEnumerator(); + } + + @Override + public SimpleVersionedSerializer getSplitSerializer() { + return new NoOpDummySplitSerializer(); + } + + @Override + public SimpleVersionedSerializer getEnumeratorCheckpointSerializer() { + return new NoOpDummyCheckpointSerializer(); + } + + @Override + public SourceReader createReader(SourceReaderContext sourceReaderContext) { + return new SourceReader<>() { + @Override + public void start() { + // Do nothing + } + + @SuppressWarnings("unchecked") + @Override + public InputStatus pollNext(ReaderOutput output) { + Tuple2 next = (Tuple2) QUEUES.get(index).poll(); + + if (next != null) { + if (next.f0 == null) { + if (next.f1 == null) { + // No more input + return InputStatus.END_OF_INPUT; + } else { + output.emitWatermark(new Watermark(next.f1)); + } + } else if (next.f1 == null) { + // No event time set + output.collect(next.f0); + } else { + // With event time + output.collect(next.f0, next.f1); + } + } + + AVAILABILITIES.set(index, new CompletableFuture<>()); + return QUEUES.get(index).isEmpty() + ? InputStatus.NOTHING_AVAILABLE + : InputStatus.MORE_AVAILABLE; + } + + @Override + public List snapshotState(long checkpointId) { + return Lists.newArrayList(new DummySplit()); + } + + @Override + public CompletableFuture isAvailable() { + return AVAILABILITIES.get(index); + } + + @Override + public void addSplits(List splits) { + // do nothing + } + + @Override + public void notifyNoMoreSplits() { + // do nothing + } + + @Override + public void close() { + // do nothing + } + }; + } + + @Override + public TypeInformation getProducedType() { + return this.type; + } + + /** + * Placeholder because the ManualSource itself implicitly represents the only split and does not + * require an actual split object. + */ + public static class DummySplit implements SourceSplit { + @Override + public String splitId() { + return "dummy"; + } + } + + /** + * Placeholder because the ManualSource does not support fault-tolerance and thus does not require + * actual checkpointing. + */ + public static class DummyCheckpoint {} + + /** Placeholder because the ManualSource does not need enumeration, but checkpointing needs it. 
*/ + private static class DummyCheckpointEnumerator + implements SplitEnumerator { + + @Override + public void start() { + // do nothing + } + + @Override + public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { + // do nothing + } + + @Override + public void addSplitsBack(List splits, int subtaskId) { + // do nothing + } + + @Override + public void addReader(int subtaskId) { + // do nothing + } + + @Override + public DummyCheckpoint snapshotState(long checkpointId) { + return new DummyCheckpoint(); + } + + @Override + public void close() { + // do nothing + } + } + + /** + * Not used - only required to avoid NullPointerException. The split is not transferred from the + * enumerator, it is implicitly represented by the ManualSource. + */ + private static class NoOpDummySplitSerializer implements SimpleVersionedSerializer { + @Override + public int getVersion() { + return 0; + } + + @Override + public byte[] serialize(DummySplit split) { + return new byte[0]; + } + + @Override + public DummySplit deserialize(int version, byte[] serialized) { + return new DummySplit(); + } + } + + /** + * Not used - only required to avoid NullPointerException. The split is not transferred from the + * enumerator, it is implicitly represented by the ManualSource. + */ + private static class NoOpDummyCheckpointSerializer + implements SimpleVersionedSerializer { + @Override + public int getVersion() { + return 0; + } + + @Override + public byte[] serialize(DummyCheckpoint split) { + return new byte[0]; + } + + @Override + public DummyCheckpoint deserialize(int version, byte[] serialized) { + return new DummyCheckpoint(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java new file mode 100644 index 000000000000..ed66ff3df076 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.Gauge; +import org.apache.flink.metrics.Metric; +import org.apache.flink.metrics.MetricConfig; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.metrics.reporter.MetricReporter; +import org.apache.flink.metrics.reporter.MetricReporterFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +public class MetricsReporterFactoryForTests implements MetricReporterFactory { + private static final TestMetricsReporter INSTANCE = new TestMetricsReporter(); + private static final Pattern TASK_METRIC_NAME = + Pattern.compile( + "\\.taskmanager\\.[^.]+\\.[^.]+\\.([^.]+)\\.\\d+\\." + + TableMaintenanceMetrics.GROUP_KEY + + "\\." + + TableMaintenanceMetrics.TABLE_NAME_KEY + + "\\.([^.]+)\\." + + TableMaintenanceMetrics.TASK_NAME_KEY + + "\\.([^.]+)\\." + + TableMaintenanceMetrics.TASK_INDEX_KEY + + "\\.([^.]+)\\.([^.]+)"); + + private static final Pattern MAIN_METRIC_NAME = + Pattern.compile( + "\\.taskmanager\\.[^.]+\\.[^.]+\\.([^.]+)\\.\\d+\\." + + TableMaintenanceMetrics.GROUP_KEY + + "\\." + + TableMaintenanceMetrics.TABLE_NAME_KEY + + "\\.([^.]+)\\.([^.]+)"); + + private static Map counters = Maps.newConcurrentMap(); + private static Map gauges = Maps.newConcurrentMap(); + private static Set monitoredMetricNames; + + public MetricsReporterFactoryForTests() { + monitoredMetricNames = + Arrays.stream(TableMaintenanceMetrics.class.getDeclaredFields()) + .map( + f -> { + try { + return f.get(null).toString(); + } catch (IllegalAccessException e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toSet()); + } + + @Override + public MetricReporter createMetricReporter(Properties properties) { + return INSTANCE; + } + + public static void reset() { + counters = Maps.newConcurrentMap(); + gauges = Maps.newConcurrentMap(); + } + + public static Long counter(List parts) { + return counterValues().get(longName(parts)); + } + + public static Long gauge(List parts) { + return gaugeValues().get(longName(parts)); + } + + public static void assertGauges(Map, Long> expected) { + Map transformed = + expected.entrySet().stream() + .collect(Collectors.toMap(k -> longName(k.getKey()), Map.Entry::getValue)); + assertThat(filter(gaugeValues(), transformed)).isEqualTo(filter(transformed, transformed)); + } + + public static void assertCounters(Map, Long> expected) { + Map transformed = + expected.entrySet().stream() + .collect(Collectors.toMap(k -> longName(k.getKey()), Map.Entry::getValue)); + assertThat(filter(counterValues(), transformed)).isEqualTo(filter(transformed, transformed)); + } + + private static Map gaugeValues() { + return gauges.entrySet().stream() + .collect( + Collectors.toMap( + entry -> longName(entry.getKey()), entry -> (Long) entry.getValue().getValue())); + } + + private static Map counterValues() { + return counters.entrySet().stream() + .collect( + Collectors.toMap( + entry -> longName(entry.getKey()), entry -> entry.getValue().getCount())); + } + + private static Map filter(Map original, Map filter) { + return original.entrySet().stream() + .filter( + entry -> { + Long filterValue = filter.get(entry.getKey()); + 
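+ // An expected value of -1 marks a metric as collected but not asserted, so it is dropped from the comparison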
return filterValue == null || filterValue != -1; + }) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + + private static String longName(String fullName) { + Matcher mainMatcher = MAIN_METRIC_NAME.matcher(fullName); + Matcher taskMatcher = TASK_METRIC_NAME.matcher(fullName); + + if (taskMatcher.matches()) { + return taskMatcher.group(1) + + "." + + taskMatcher.group(2) + + "." + + taskMatcher.group(3) + + "." + + taskMatcher.group(4) + + "." + + taskMatcher.group(5); + } + + if (mainMatcher.matches()) { + return mainMatcher.group(1) + "." + mainMatcher.group(2) + "." + mainMatcher.group(3); + } + + throw new RuntimeException(String.format("Can't parse simplified metrics name %s", fullName)); + } + + private static String longName(List parts) { + return parts.stream().map(s -> s.replaceAll("\\.", "_")).collect(Collectors.joining(".")); + } + + private static class TestMetricsReporter implements MetricReporter { + @Override + public void open(MetricConfig config) { + // do nothing + } + + @Override + public void close() { + // do nothing + } + + @Override + public void notifyOfAddedMetric(Metric metric, String metricName, MetricGroup group) { + if (monitoredMetricNames.contains(metricName)) { + if (metric instanceof Counter) { + counters.put(group.getMetricIdentifier(metricName), (Counter) metric); + } + + if (metric instanceof Gauge) { + gauges.put(group.getMetricIdentifier(metricName), (Gauge) metric); + } + } + } + + @Override + public void notifyOfRemovedMetric(Metric metric, String metricName, MetricGroup group) { + // do nothing + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java new file mode 100644 index 000000000000..f9cbc9715cce --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java @@ -0,0 +1,397 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import javax.annotation.Nullable; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.MetricOptions; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.core.execution.SavepointFormatType; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.flink.streaming.api.transformations.SinkTransformation; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionData; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.data.FileHelpers; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.deletes.PositionDelete; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +public class OperatorTestBase { + private static final int NUMBER_TASK_MANAGERS = 1; + private static final int SLOTS_PER_TASK_MANAGER = 8; + private static final Schema SCHEMA_WITH_PRIMARY_KEY = + new Schema( + Lists.newArrayList( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())), + ImmutableMap.of(), + ImmutableSet.of(SimpleDataUtil.SCHEMA.columns().get(0).fieldId())); + + protected static final String UID_SUFFIX = "UID-Dummy"; + protected static final String SLOT_SHARING_GROUP = "SlotSharingGroup"; + protected static final TriggerLockFactory LOCK_FACTORY = new MemoryLockFactory(); + + public static final String IGNORED_OPERATOR_NAME = "Ignore"; + + static final long EVENT_TIME = 10L; + static final long EVENT_TIME_2 = 11L; + static final Watermark WATERMARK = new Watermark(EVENT_TIME); + protected static final String DUMMY_TASK_NAME = "dummyTask"; + protected static final String DUMMY_TABLE_NAME = "dummyTable"; + + static final String FILE_NAME_1 = "fileName1"; + static final String FILE_NAME_2 = "fileName2"; + 
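+ // Watermark one event-time tick after WATERMARK, for tests that need a second, later watermark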
static final Watermark WATERMARK_2 = new Watermark(EVENT_TIME_2); + + @RegisterExtension + protected static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(NUMBER_TASK_MANAGERS) + .setNumberSlotsPerTaskManager(SLOTS_PER_TASK_MANAGER) + .setConfiguration(config()) + .build()); + + @TempDir private Path warehouseDir; + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + @BeforeEach + void before() { + LOCK_FACTORY.open(); + LOCK_FACTORY.createLock().unlock(); + LOCK_FACTORY.createRecoveryLock().unlock(); + MetricsReporterFactoryForTests.reset(); + } + + @AfterEach + void after() throws IOException { + LOCK_FACTORY.close(); + } + + protected static Table createTable() { + // only test V2 tables as compaction doesn't support V3 with row lineage + return createTable("2"); + } + + protected static Table createTable(String formatVersion) { + return CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + PartitionSpec.unpartitioned(), + null, + ImmutableMap.of( + TableProperties.FORMAT_VERSION, + formatVersion, + "flink.max-continuous-empty-commits", + "100000")); + } + + protected static Table createTableWithDelete() { + return CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SCHEMA_WITH_PRIMARY_KEY, + PartitionSpec.unpartitioned(), + null, + ImmutableMap.of("format-version", "2", "write.upsert.enabled", "true")); + } + + protected static Table createPartitionedTable() { + return CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build(), + null, + ImmutableMap.of("format-version", "2", "flink.max-continuous-empty-commits", "100000")); + } + + protected void insert(Table table, Integer id, String data) throws IOException { + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .appendToTable(Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); + table.refresh(); + } + + protected void insert(Table table, Integer id, String data, String extra) throws IOException { + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .appendToTable(Lists.newArrayList(SimpleDataUtil.createRecord(id, data, extra))); + table.refresh(); + } + + /** + * For the same identifier column id this methods simulate the following row operations:

+ *
+ * <ul>
+ *   <li>add an equality delete on oldData
+ *   <li>insert newData
+ * </ul>
  • + * + * @param table to modify + * @param id the identifier column id + * @param oldData the old data to be deleted + * @param newData the new data to be inserted + */ + protected void update(Table table, Integer id, String oldData, String newData) + throws IOException { + DataFile dataFile = + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .writeFile(Lists.newArrayList(SimpleDataUtil.createRecord(id, newData))); + DeleteFile eqDelete = writeEqualityDelete(table, id, oldData); + + table.newRowDelta().addRows(dataFile).addDeletes(eqDelete).commit(); + } + + /** + * For the same identifier column id this methods simulate the following row operations: + *
+ * <ul>
+ *   <li>add an equality delete on oldData
+ *   <li>insert tempData
+ *   <li>add a position delete on tempData
+ *   <li>insert newData
+ * </ul>
  • + * + * @param table to modify + * @param id the identifier column id + * @param oldData the old data to be deleted + * @param tempData the temp data to be inserted and deleted with a position delete + * @param newData the new data to be inserted + */ + protected void update(Table table, Integer id, String oldData, String tempData, String newData) + throws IOException { + DataFile dataFile = + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .writeFile( + Lists.newArrayList( + SimpleDataUtil.createRecord(id, tempData), + SimpleDataUtil.createRecord(id, newData))); + DeleteFile eqDelete = writeEqualityDelete(table, id, oldData); + DeleteFile posDelete = writePosDelete(table, dataFile.path(), 0, id, tempData); + + table.newRowDelta().addRows(dataFile).addDeletes(eqDelete).addDeletes(posDelete).commit(); + } + + protected void insertPartitioned(Table table, Integer id, String data) throws IOException { + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .appendToTable( + TestHelpers.Row.of(data), Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); + table.refresh(); + } + + protected void insertFullPartitioned(Table table, Integer id, String data) throws IOException { + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .appendToTable( + TestHelpers.Row.of(data, id), + Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); + table.refresh(); + } + + protected void dropTable() { + CATALOG_EXTENSION.catalogLoader().loadCatalog().dropTable(TestFixtures.TABLE_IDENTIFIER); + } + + protected TableLoader tableLoader() { + return CATALOG_EXTENSION.tableLoader(); + } + + /** + * Close the {@link JobClient} and wait for the job closure. If the savepointDir is specified, it + * stops the job with a savepoint. + * + * @param jobClient the job to close + * @param savepointDir the savepointDir to store the last savepoint. If null then + * stop without a savepoint. + * @return configuration for restarting the job from the savepoint + */ + @Nullable + protected static String closeJobClient(JobClient jobClient, File savepointDir) { + if (jobClient != null) { + if (savepointDir != null) { + // Stop with savepoint + jobClient.stopWithSavepoint(false, savepointDir.getPath(), SavepointFormatType.CANONICAL); + // Wait until the savepoint is created and the job has been stopped + Awaitility.await().until(() -> savepointDir.listFiles(File::isDirectory).length == 1); + return savepointDir.listFiles(File::isDirectory)[0].getAbsolutePath(); + } else { + jobClient.cancel(); + } + + // Wait until the job has been stopped + Awaitility.await().until(() -> jobClient.getJobStatus().get().isTerminalState()); + } + + return null; + } + + /** + * Close the {@link JobClient} and wait for the job closure. + * + * @param jobClient the job to close + */ + protected static void closeJobClient(JobClient jobClient) { + closeJobClient(jobClient, null); + } + + protected static void checkUidsAreSet(StreamExecutionEnvironment env, String uidSuffix) { + env.getTransformations().stream() + .filter( + t -> !(t instanceof SinkTransformation) && !(t.getName().equals(IGNORED_OPERATOR_NAME))) + .forEach( + transformation -> { + assertThat(transformation.getUid()).isNotNull(); + if (uidSuffix != null) { + assertThat(transformation.getUid()).contains(UID_SUFFIX); + } + }); + } + + protected static void checkSlotSharingGroupsAreSet(StreamExecutionEnvironment env, String name) { + String nameToCheck = name != null ? 
name : StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP; + + env.getTransformations().stream() + .filter( + t -> !(t instanceof SinkTransformation) && !(t.getName().equals(IGNORED_OPERATOR_NAME))) + .forEach( + t -> { + assertThat(t.getSlotSharingGroup()).isPresent(); + assertThat(t.getSlotSharingGroup().get().getName()).isEqualTo(nameToCheck); + }); + } + + private static Configuration config() { + Configuration config = new Configuration(DISABLE_CLASSLOADER_CHECK_CONFIG); + MetricOptions.forReporter(config, "test_reporter") + .set(MetricOptions.REPORTER_FACTORY_CLASS, MetricsReporterFactoryForTests.class.getName()); + return config; + } + + private DeleteFile writeEqualityDelete(Table table, Integer id, String oldData) + throws IOException { + File file = File.createTempFile("junit", null, warehouseDir.toFile()); + assertThat(file.delete()).isTrue(); + return FileHelpers.writeDeleteFile( + table, + Files.localOutput(file), + new PartitionData(PartitionSpec.unpartitioned().partitionType()), + Lists.newArrayList(SimpleDataUtil.createRecord(id, oldData)), + SCHEMA_WITH_PRIMARY_KEY); + } + + private DeleteFile writePosDelete( + Table table, CharSequence path, Integer pos, Integer id, String oldData) throws IOException { + File file = File.createTempFile("junit", null, warehouseDir.toFile()); + assertThat(file.delete()).isTrue(); + PositionDelete posDelete = PositionDelete.create(); + GenericRecord nested = GenericRecord.create(table.schema()); + nested.set(0, id); + nested.set(1, oldData); + posDelete.set(path, pos, nested); + return FileHelpers.writePosDeleteFile( + table, Files.localOutput(file), null, Lists.newArrayList(posDelete)); + } + + static void trigger(OneInputStreamOperatorTestHarness harness) throws Exception { + long time = System.currentTimeMillis(); + harness.processElement(Trigger.create(time, 0), time); + } + + private static class MemoryLock implements TriggerLockFactory.Lock { + volatile boolean locked = false; + + @Override + public boolean tryLock() { + if (locked) { + return false; + } else { + locked = true; + return true; + } + } + + @Override + public boolean isHeld() { + return locked; + } + + @Override + public void unlock() { + locked = false; + } + } + + private static class MemoryLockFactory implements TriggerLockFactory { + private static final TriggerLockFactory.Lock MAINTENANCE_LOCK = new MemoryLock(); + private static final TriggerLockFactory.Lock RECOVERY_LOCK = new MemoryLock(); + + @Override + public void open() { + // do nothing + } + + @Override + public Lock createLock() { + return MAINTENANCE_LOCK; + } + + @Override + public Lock createRecoveryLock() { + return RECOVERY_LOCK; + } + + @Override + public void close() { + // do nothing + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java new file mode 100644 index 000000000000..68aaf29ac0d1 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.apache.iceberg.actions.SizeBasedFileRewritePlanner.MIN_INPUT_FILES; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.Set; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Table; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; + +class RewriteUtil { + private RewriteUtil() {} + + static List planDataFileRewrite(TableLoader tableLoader) + throws Exception { + try (OneInputStreamOperatorTestHarness + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewritePlanner( + OperatorTestBase.DUMMY_TABLE_NAME, + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader, + 11, + 10_000_000L, + ImmutableMap.of(MIN_INPUT_FILES, "2"), + Expressions.alwaysTrue()))) { + testHarness.open(); + + OperatorTestBase.trigger(testHarness); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + return testHarness.extractOutputValues(); + } + } + + static List executeRewrite( + List elements) throws Exception { + try (OneInputStreamOperatorTestHarness< + DataFileRewritePlanner.PlannedGroup, DataFileRewriteRunner.ExecutedGroup> + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewriteRunner( + OperatorTestBase.DUMMY_TABLE_NAME, OperatorTestBase.DUMMY_TABLE_NAME, 0))) { + testHarness.open(); + + for (DataFileRewritePlanner.PlannedGroup element : elements) { + testHarness.processElement(element, System.currentTimeMillis()); + } + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + return testHarness.extractOutputValues(); + } + } + + static Set newDataFiles(Table table) { + table.refresh(); + return Sets.newHashSet(table.currentSnapshot().addedDataFiles(table.io())); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java new file mode 100644 index 000000000000..9e8f2ec92162 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java @@ -0,0 +1,278 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.executeRewrite; +import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; +import static org.apache.iceberg.metrics.CommitMetricsResult.TOTAL_DATA_FILES; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.junit.jupiter.api.Test; + +class TestDataFileRewriteCommitter extends OperatorTestBase { + @Test + void testUnpartitioned() throws Exception { + Table table = createTable(); + insert(table, 1, "p1"); + insert(table, 2, "p2"); + insert(table, 3, "p3"); + + List planned = planDataFileRewrite(tableLoader()); + assertThat(planned).hasSize(1); + List rewritten = executeRewrite(planned); + assertThat(rewritten).hasSize(1); + + try (OneInputStreamOperatorTestHarness + testHarness = harness()) { + testHarness.open(); + + testHarness.processElement(rewritten.get(0), EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + + testHarness.processWatermark(EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + } + + assertDataFiles( + table, rewritten.get(0).group().addedFiles(), rewritten.get(0).group().rewrittenFiles(), 1); + } + + @Test + void testPartitioned() throws Exception { + Table table = createPartitionedTable(); + insertPartitioned(table, 1, "p1"); + insertPartitioned(table, 2, "p1"); + insertPartitioned(table, 3, "p2"); + insertPartitioned(table, 4, "p2"); + + List planned = planDataFileRewrite(tableLoader()); + assertThat(planned).hasSize(2); + List rewritten = executeRewrite(planned); + assertThat(rewritten).hasSize(2); + assertThat(rewritten.get(0).groupsPerCommit()).isEqualTo(1); + assertThat(rewritten.get(1).groupsPerCommit()).isEqualTo(1); + ensureDifferentGroups(rewritten); + + try (OneInputStreamOperatorTestHarness + testHarness = harness()) { + testHarness.open(); + + testHarness.processElement(rewritten.get(0), EVENT_TIME); + assertDataFiles( + table, + rewritten.get(0).group().addedFiles(), + rewritten.get(0).group().rewrittenFiles(), + 3); + + testHarness.processElement(rewritten.get(1), EVENT_TIME); + assertDataFiles( + table, + rewritten.get(1).group().addedFiles(), + rewritten.get(1).group().rewrittenFiles(), + 2); + + assertThat(testHarness.extractOutputValues()).isEmpty(); + + testHarness.processWatermark(EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + } + } + + @Test + void testNewTable() throws Exception { + Table table = createTable(); + List rewritten; + + try (OneInputStreamOperatorTestHarness + testHarness = harness()) { + 
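+ // The committer is opened against a table with no snapshots yet; data is inserted and rewritten only afterwards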
testHarness.open(); + + insert(table, 1, "p1"); + insert(table, 2, "p2"); + insert(table, 3, "p3"); + + List planned = planDataFileRewrite(tableLoader()); + assertThat(planned).hasSize(1); + rewritten = executeRewrite(planned); + assertThat(rewritten).hasSize(1); + + testHarness.processElement(rewritten.get(0), EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + + testHarness.processWatermark(EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + } + + assertDataFiles( + table, rewritten.get(0).group().addedFiles(), rewritten.get(0).group().rewrittenFiles(), 1); + } + + @Test + void testBatchSize() throws Exception { + Table table = createPartitionedTable(); + insertPartitioned(table, 1, "p1"); + insertPartitioned(table, 2, "p1"); + insertPartitioned(table, 3, "p2"); + insertPartitioned(table, 4, "p2"); + insertPartitioned(table, 5, "p3"); + insertPartitioned(table, 6, "p3"); + + List planned = planDataFileRewrite(tableLoader()); + assertThat(planned).hasSize(3); + List rewritten = executeRewrite(planned); + assertThat(rewritten).hasSize(3); + ensureDifferentGroups(rewritten); + + try (OneInputStreamOperatorTestHarness + testHarness = harness()) { + testHarness.open(); + + testHarness.processElement(setBatchSizeToTwo(rewritten.get(0)), EVENT_TIME); + assertNoChange(table); + testHarness.processElement(setBatchSizeToTwo(rewritten.get(1)), EVENT_TIME); + + Set added = Sets.newHashSet(rewritten.get(0).group().addedFiles()); + added.addAll(rewritten.get(1).group().addedFiles()); + Set removed = Sets.newHashSet(rewritten.get(0).group().rewrittenFiles()); + removed.addAll(rewritten.get(1).group().rewrittenFiles()); + assertDataFiles(table, added, removed, 4); + + testHarness.processElement(setBatchSizeToTwo(rewritten.get(2)), EVENT_TIME); + assertNoChange(table); + + assertThat(testHarness.extractOutputValues()).isEmpty(); + + testHarness.processWatermark(EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + } + + // This should be committed on close + assertDataFiles( + table, rewritten.get(2).group().addedFiles(), rewritten.get(2).group().rewrittenFiles(), 3); + } + + @Test + void testError() throws Exception { + Table table = createPartitionedTable(); + insertPartitioned(table, 1, "p1"); + insertPartitioned(table, 2, "p1"); + insertPartitioned(table, 3, "p2"); + insertPartitioned(table, 4, "p2"); + insertPartitioned(table, 5, "p3"); + insertPartitioned(table, 6, "p3"); + insertPartitioned(table, 7, "p4"); + insertPartitioned(table, 8, "p4"); + + List planned = planDataFileRewrite(tableLoader()); + assertThat(planned).hasSize(4); + List rewritten = executeRewrite(planned); + assertThat(rewritten).hasSize(4); + + try (OneInputStreamOperatorTestHarness + testHarness = harness()) { + testHarness.open(); + + testHarness.processElement(setBatchSizeToTwo(rewritten.get(0)), EVENT_TIME); + assertNoChange(table); + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + + DataFileRewriteRunner.ExecutedGroup group = spy(setBatchSizeToTwo(rewritten.get(1))); + when(group.group()).thenThrow(new RuntimeException("Testing error")); + testHarness.processElement(group, EVENT_TIME); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).hasSize(1); + assertThat( + testHarness + .getSideOutput(TaskResultAggregator.ERROR_STREAM) + .poll() + .getValue() + .getMessage()) + .contains("Testing error"); + } + } + + private OneInputStreamOperatorTestHarness harness() + throws Exception { + return new 
OneInputStreamOperatorTestHarness<>(
+        new DataFileRewriteCommitter(
+            OperatorTestBase.DUMMY_TABLE_NAME,
+            OperatorTestBase.DUMMY_TABLE_NAME,
+            0,
+            tableLoader()));
+  }
+
+  private static DataFileRewriteRunner.ExecutedGroup setBatchSizeToTwo(
+      DataFileRewriteRunner.ExecutedGroup from) {
+    return new DataFileRewriteRunner.ExecutedGroup(from.snapshotId(), 2, from.group());
+  }
+
+  // Ensure that the groups are different, so that the tests do not accidentally pass
+  private static void ensureDifferentGroups(List<DataFileRewriteRunner.ExecutedGroup> rewritten) {
+    List<String> resultFiles =
+        rewritten.stream()
+            .flatMap(task -> task.group().addedFiles().stream().map(ContentFile::location))
+            .collect(Collectors.toList());
+    assertThat(resultFiles).hasSize(Set.copyOf(resultFiles).size());
+  }
+
+  /**
+   * Assert that the number of the data files in the table is as expected. Additionally, tests that
+   * the last commit contains the expected added and removed files.
+   *
+   * @param table the table to check
+   * @param expectedAdded the expected added data files
+   * @param expectedRemoved the expected removed data files
+   * @param expectedCurrent the expected current data files count
+   */
+  private static void assertDataFiles(
+      Table table,
+      Set<DataFile> expectedAdded,
+      Set<DataFile> expectedRemoved,
+      long expectedCurrent) {
+    table.refresh();
+
+    assertThat(table.currentSnapshot().summary().get(TOTAL_DATA_FILES))
+        .isEqualTo(String.valueOf(expectedCurrent));
+    Set<DataFile> actualAdded = Sets.newHashSet(table.currentSnapshot().addedDataFiles(table.io()));
+    Set<DataFile> actualRemoved =
+        Sets.newHashSet(table.currentSnapshot().removedDataFiles(table.io()));
+    assertThat(actualAdded.stream().map(DataFile::location).collect(Collectors.toSet()))
+        .isEqualTo(expectedAdded.stream().map(DataFile::location).collect(Collectors.toSet()));
+    assertThat(actualRemoved.stream().map(DataFile::location).collect(Collectors.toSet()))
+        .isEqualTo(expectedRemoved.stream().map(DataFile::location).collect(Collectors.toSet()));
+  }
+
+  private static void assertNoChange(Table table) {
+    long original = table.currentSnapshot().snapshotId();
+    table.refresh();
+
+    assertThat(table.currentSnapshot().snapshotId()).isEqualTo(original);
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java
new file mode 100644
index 000000000000..2d83f553e576
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.apache.iceberg.actions.SizeBasedFileRewritePlanner.MIN_INPUT_FILES; +import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.newDataFiles; +import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Table; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.junit.jupiter.api.Test; + +class TestDataFileRewritePlanner extends OperatorTestBase { + @Test + void testFailsOnV3Table() throws Exception { + Table table = createTable("3"); + Set expected = Sets.newHashSetWithExpectedSize(3); + insert(table, 1, "a"); + expected.addAll(newDataFiles(table)); + + assertThatThrownBy(() -> planDataFileRewrite(tableLoader())) + .hasMessageContaining( + "Flink does not support compaction on row lineage enabled tables (V3+)") + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + void testUnpartitioned() throws Exception { + Set expected = Sets.newHashSetWithExpectedSize(3); + Table table = createTable(); + insert(table, 1, "a"); + expected.addAll(newDataFiles(table)); + insert(table, 2, "b"); + expected.addAll(newDataFiles(table)); + insert(table, 3, "c"); + expected.addAll(newDataFiles(table)); + + List actual = planDataFileRewrite(tableLoader()); + + assertThat(actual).hasSize(1); + assertRewriteFileGroup(actual.get(0), table, expected); + } + + @Test + void testPartitioned() throws Exception { + Set expectedP1 = Sets.newHashSetWithExpectedSize(2); + Set expectedP2 = Sets.newHashSetWithExpectedSize(2); + Table table = createPartitionedTable(); + insertPartitioned(table, 1, "p1"); + expectedP1.addAll(newDataFiles(table)); + insertPartitioned(table, 2, "p1"); + expectedP1.addAll(newDataFiles(table)); + + insertPartitioned(table, 3, "p2"); + expectedP2.addAll(newDataFiles(table)); + insertPartitioned(table, 4, "p2"); + expectedP2.addAll(newDataFiles(table)); + + // This should not participate in compaction, as there is no more files in the partition + insertPartitioned(table, 5, "p3"); + + List actual = planDataFileRewrite(tableLoader()); + + assertThat(actual).hasSize(2); + if (actual.get(0).group().info().partition().get(0, String.class).equals("p1")) { + assertRewriteFileGroup(actual.get(0), table, expectedP1); + assertRewriteFileGroup(actual.get(1), table, expectedP2); + } else { + assertRewriteFileGroup(actual.get(0), table, expectedP2); + assertRewriteFileGroup(actual.get(1), table, expectedP1); + } + } + + @Test + void testError() throws Exception { + Table table = createTable(); + insert(table, 1, "a"); + insert(table, 2, "b"); + + try (OneInputStreamOperatorTestHarness + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewritePlanner( + OperatorTestBase.DUMMY_TABLE_NAME, + 
OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + 11, + 1L, + ImmutableMap.of(MIN_INPUT_FILES, "2"), + Expressions.alwaysTrue()))) { + testHarness.open(); + + // Cause an exception + dropTable(); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + trigger(testHarness); + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).hasSize(1); + assertThat( + testHarness + .getSideOutput(TaskResultAggregator.ERROR_STREAM) + .poll() + .getValue() + .getMessage()) + .contains("Table does not exist: "); + } + } + + @Test + void testV2Table() throws Exception { + Table table = createTableWithDelete(); + update(table, 1, null, "a", "b"); + update(table, 1, "b", "c"); + + List actual = planDataFileRewrite(tableLoader()); + + assertThat(actual).hasSize(1); + List tasks = actual.get(0).group().fileScanTasks(); + assertThat(tasks).hasSize(2); + // Find the task with the deletes + FileScanTask withDelete = tasks.get(0).deletes().isEmpty() ? tasks.get(1) : tasks.get(0); + assertThat(withDelete.deletes()).hasSize(2); + assertThat(withDelete.deletes().stream().map(ContentFile::content).collect(Collectors.toList())) + .containsExactlyInAnyOrder(FileContent.POSITION_DELETES, FileContent.EQUALITY_DELETES); + } + + @Test + void testMaxRewriteBytes() throws Exception { + Table table = createPartitionedTable(); + insertPartitioned(table, 1, "p1"); + insertPartitioned(table, 2, "p1"); + insertPartitioned(table, 3, "p2"); + insertPartitioned(table, 4, "p2"); + + // First run with high maxRewriteBytes + List planWithNoMaxRewriteBytes = + planDataFileRewrite(tableLoader()); + assertThat(planWithNoMaxRewriteBytes).hasSize(2); + + // Second run with low maxRewriteBytes, the 2nd group should be removed from the plan + long maxRewriteBytes = + planWithNoMaxRewriteBytes.get(0).group().fileScanTasks().get(0).sizeBytes() + + planWithNoMaxRewriteBytes.get(1).group().fileScanTasks().get(0).sizeBytes() + + 1; + try (OneInputStreamOperatorTestHarness + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewritePlanner( + OperatorTestBase.DUMMY_TABLE_NAME, + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + 11, + maxRewriteBytes, + ImmutableMap.of(MIN_INPUT_FILES, "2"), + Expressions.alwaysTrue()))) { + testHarness.open(); + + OperatorTestBase.trigger(testHarness); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + // Only a single group is planned + assertThat(testHarness.extractOutputValues()).hasSize(1); + } + } + + void assertRewriteFileGroup( + DataFileRewritePlanner.PlannedGroup plannedGroup, Table table, Set files) { + assertThat(plannedGroup.table().currentSnapshot().snapshotId()) + .isEqualTo(table.currentSnapshot().snapshotId()); + assertThat(plannedGroup.groupsPerCommit()).isEqualTo(1); + assertThat( + plannedGroup.group().fileScanTasks().stream() + .map(s -> s.file().location()) + .collect(Collectors.toSet())) + .containsExactlyInAnyOrderElementsOf( + files.stream().map(ContentFile::location).collect(Collectors.toList())); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java new file mode 100644 index 000000000000..3c5a10328756 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java @@ -0,0 +1,357 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.apache.iceberg.actions.RewriteDataFiles.TARGET_FILE_SIZE_BYTES; +import static org.apache.iceberg.actions.SizeBasedFileRewritePlanner.MIN_INPUT_FILES; +import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.executeRewrite; +import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.Set; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionData; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.GenericParquetReaders; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +class TestDataFileRewriteRunner extends OperatorTestBase { + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testExecute(boolean partitioned) throws Exception { + Table table; + PartitionData partition; + if (partitioned) { + table = createPartitionedTable(); + partition = new PartitionData(table.spec().partitionType()); + partition.set(0, "p1"); + insertPartitioned(table, 1, "p1"); + insertPartitioned(table, 2, "p1"); + insertPartitioned(table, 3, "p1"); + } else { + table = createTable(); + partition = new PartitionData(PartitionSpec.unpartitioned().partitionType()); + insert(table, 1, "p1"); + insert(table, 2, "p1"); + insert(table, 3, "p1"); + } + + List planned = 
planDataFileRewrite(tableLoader()); + assertThat(planned).hasSize(1); + List actual = executeRewrite(planned); + assertThat(actual).hasSize(1); + + assertRewriteFileGroup( + actual.get(0), + table, + records( + table.schema(), + ImmutableSet.of( + ImmutableList.of(1, "p1"), ImmutableList.of(2, "p1"), ImmutableList.of(3, "p1"))), + 1, + ImmutableSet.of(partition)); + } + + @Test + void testPartitionSpecChange() throws Exception { + Table table = createPartitionedTable(); + insertPartitioned(table, 1, "p1"); + insertPartitioned(table, 2, "p1"); + PartitionData oldPartition = new PartitionData(table.spec().partitionType()); + oldPartition.set(0, "p1"); + + try (OneInputStreamOperatorTestHarness< + DataFileRewritePlanner.PlannedGroup, DataFileRewriteRunner.ExecutedGroup> + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewriteRunner( + OperatorTestBase.DUMMY_TABLE_NAME, OperatorTestBase.DUMMY_TABLE_NAME, 0))) { + testHarness.open(); + + List planned = planDataFileRewrite(tableLoader()); + assertThat(planned).hasSize(1); + + testHarness.processElement(planned.get(0), System.currentTimeMillis()); + List actual = testHarness.extractOutputValues(); + assertThat(actual).hasSize(1); + assertRewriteFileGroup( + actual.get(0), + table, + records( + table.schema(), + ImmutableSet.of(ImmutableList.of(1, "p1"), ImmutableList.of(2, "p1"))), + 1, + ImmutableSet.of(oldPartition)); + + insertPartitioned(table, 3, "p1"); + + planned = planDataFileRewrite(tableLoader()); + assertThat(planned).hasSize(1); + + testHarness.processElement(planned.get(0), System.currentTimeMillis()); + actual = testHarness.extractOutputValues(); + assertThat(actual).hasSize(2); + assertRewriteFileGroup( + actual.get(1), + table, + records( + table.schema(), + ImmutableSet.of( + ImmutableList.of(1, "p1"), ImmutableList.of(2, "p1"), ImmutableList.of(3, "p1"))), + 1, + ImmutableSet.of(oldPartition)); + + // Alter the table schema + table.updateSpec().addField("id").commit(); + // Insert some now data + insertFullPartitioned(table, 4, "p1"); + insertFullPartitioned(table, 4, "p1"); + PartitionData newPartition = new PartitionData(table.spec().partitionType()); + newPartition.set(0, "p1"); + newPartition.set(1, 4); + table.refresh(); + + planned = planDataFileRewrite(tableLoader()); + assertThat(planned).hasSize(2); + DataFileRewritePlanner.PlannedGroup oldCompact = planned.get(0); + DataFileRewritePlanner.PlannedGroup newCompact = planned.get(1); + if (oldCompact.group().inputFileNum() == 2) { + newCompact = planned.get(0); + oldCompact = planned.get(1); + } + + testHarness.processElement(newCompact, System.currentTimeMillis()); + actual = testHarness.extractOutputValues(); + assertThat(actual).hasSize(3); + assertRewriteFileGroup( + actual.get(2), + table, + records( + table.schema(), + ImmutableList.of(ImmutableList.of(4, "p1"), ImmutableList.of(4, "p1"))), + 1, + ImmutableSet.of(newPartition)); + + testHarness.processElement(oldCompact, System.currentTimeMillis()); + actual = testHarness.extractOutputValues(); + assertThat(actual).hasSize(4); + PartitionData[] transformedPartitions = { + newPartition.copy(), newPartition.copy(), newPartition.copy() + }; + transformedPartitions[0].set(1, 1); + transformedPartitions[1].set(1, 2); + transformedPartitions[2].set(1, 3); + assertRewriteFileGroup( + actual.get(3), + table, + records( + table.schema(), + ImmutableSet.of( + ImmutableList.of(1, "p1"), ImmutableList.of(2, "p1"), ImmutableList.of(3, "p1"))), + 3, + Sets.newHashSet(transformedPartitions)); + } + } 
+ + @Test + void testError() throws Exception { + Table table = createTable(); + insert(table, 1, "a"); + insert(table, 2, "b"); + + try (OneInputStreamOperatorTestHarness< + DataFileRewritePlanner.PlannedGroup, DataFileRewriteRunner.ExecutedGroup> + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewriteRunner( + OperatorTestBase.DUMMY_TABLE_NAME, OperatorTestBase.DUMMY_TABLE_NAME, 0))) { + testHarness.open(); + + List planned = planDataFileRewrite(tableLoader()); + assertThat(planned).hasSize(1); + // Cause an exception + dropTable(); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + testHarness.processElement(planned.get(0), System.currentTimeMillis()); + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).hasSize(1); + assertThat( + testHarness + .getSideOutput(TaskResultAggregator.ERROR_STREAM) + .poll() + .getValue() + .getMessage()) + .contains("File does not exist: "); + } + } + + @Test + void testV2Table() throws Exception { + Table table = createTableWithDelete(); + update(table, 1, null, "a", "b"); + update(table, 1, "b", "c"); + + List planned = planDataFileRewrite(tableLoader()); + assertThat(planned).hasSize(1); + + List actual = executeRewrite(planned); + assertThat(actual).hasSize(1); + + assertRewriteFileGroup( + actual.get(0), + table, + records(table.schema(), ImmutableSet.of(ImmutableList.of(1, "c"))), + 1, + ImmutableSet.of(new PartitionData(PartitionSpec.unpartitioned().partitionType()))); + } + + @Test + void testSplitSize() throws Exception { + Table table = createTable(); + + File dataDir = new File(new Path(table.location(), "data").toUri().getPath()); + dataDir.mkdir(); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(table, FileFormat.PARQUET, dataDir.toPath()); + List expected = Lists.newArrayListWithExpectedSize(4000); + for (int i = 0; i < 4; ++i) { + List batch = RandomGenericData.generate(table.schema(), 1000, 10 + i); + dataAppender.appendToTable(batch); + expected.addAll(batch); + } + + // First run with high target file size + List planWithNoTargetFileSize = + planDataFileRewrite(tableLoader()); + assertThat(planWithNoTargetFileSize).hasSize(1); + + // Second run with low target file size + long targetFileSize = + planWithNoTargetFileSize.get(0).group().fileScanTasks().get(0).sizeBytes() + + planWithNoTargetFileSize.get(0).group().fileScanTasks().get(1).sizeBytes(); + List planned; + try (OneInputStreamOperatorTestHarness + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewritePlanner( + OperatorTestBase.DUMMY_TABLE_NAME, + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + 11, + 10_000_000, + ImmutableMap.of( + MIN_INPUT_FILES, + "2", + TARGET_FILE_SIZE_BYTES, + String.valueOf(targetFileSize)), + Expressions.alwaysTrue()))) { + testHarness.open(); + + OperatorTestBase.trigger(testHarness); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + planned = testHarness.extractOutputValues(); + assertThat(planned).hasSize(1); + } + + List actual = executeRewrite(planned); + assertThat(actual).hasSize(1); + + assertRewriteFileGroup( + actual.get(0), + table, + expected, + 2, + ImmutableSet.of(new PartitionData(PartitionSpec.unpartitioned().partitionType()))); + } + + void assertRewriteFileGroup( + DataFileRewriteRunner.ExecutedGroup actual, + Table table, + Collection expectedRecords, + int expectedFileNum, + Set expectedPartitions) + throws IOException { + 
assertThat(actual.snapshotId()).isEqualTo(table.currentSnapshot().snapshotId()); + assertThat(actual.groupsPerCommit()).isEqualTo(1); + assertThat(actual.group().addedFiles()).hasSize(expectedFileNum); + Collection writtenRecords = Lists.newArrayListWithExpectedSize(expectedRecords.size()); + Set writtenPartitions = Sets.newHashSetWithExpectedSize(expectedPartitions.size()); + for (DataFile newDataFile : actual.group().addedFiles()) { + assertThat(newDataFile.format()).isEqualTo(FileFormat.PARQUET); + assertThat(newDataFile.content()).isEqualTo(FileContent.DATA); + assertThat(newDataFile.keyMetadata()).isNull(); + writtenPartitions.add(newDataFile.partition()); + + try (CloseableIterable reader = + Parquet.read(table.io().newInputFile(newDataFile.location())) + .project(table.schema()) + .createReaderFunc( + fileSchema -> GenericParquetReaders.buildReader(table.schema(), fileSchema)) + .build()) { + List newRecords = Lists.newArrayList(reader); + assertThat(newRecords).hasSize((int) newDataFile.recordCount()); + writtenRecords.addAll(newRecords); + } + } + + assertThat(writtenRecords).containsExactlyInAnyOrderElementsOf(expectedRecords); + assertThat(writtenPartitions).isEqualTo(expectedPartitions); + } + + private List records(Schema schema, Collection> data) { + GenericRecord record = GenericRecord.create(schema); + + ImmutableList.Builder builder = ImmutableList.builder(); + data.forEach( + recordData -> + builder.add( + record.copy(ImmutableMap.of("id", recordData.get(0), "data", recordData.get(1))))); + + return builder.build(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java new file mode 100644 index 000000000000..7511e1029b6f --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.flink.maintenance.operator;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.FileSystems;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Set;
+import java.util.stream.Collectors;
+import org.apache.flink.api.common.typeutils.base.StringSerializer;
+import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.flink.TableLoader;
+import org.apache.iceberg.flink.TestFixtures;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+class TestDeleteFilesProcessor extends OperatorTestBase {
+  private static final String DUMMY_FILE_NAME = "dummy";
+  private static final Set<String> TABLE_FILES =
+      ImmutableSet.of(
+          "metadata/v1.metadata.json",
+          "metadata/version-hint.text",
+          "metadata/.version-hint.text.crc",
+          "metadata/.v1.metadata.json.crc");
+
+  private Table table;
+
+  @BeforeEach
+  void before() {
+    this.table = createTable();
+  }
+
+  @Test
+  void testDelete() throws Exception {
+    // Write an extra file
+    Path dummyFile = Path.of(tablePath(table).toString(), DUMMY_FILE_NAME);
+    Files.write(dummyFile, "DUMMY".getBytes(StandardCharsets.UTF_8));
+
+    Set<String> files = listFiles(table);
+    assertThat(files)
+        .containsAll(TABLE_FILES)
+        .contains(DUMMY_FILE_NAME)
+        .hasSize(TABLE_FILES.size() + 1);
+
+    deleteFile(tableLoader(), dummyFile.toString());
+
+    assertThat(listFiles(table)).isEqualTo(TABLE_FILES);
+  }
+
+  @Test
+  void testDeleteMissingFile() throws Exception {
+    Path dummyFile =
+        FileSystems.getDefault().getPath(table.location().substring(5), DUMMY_FILE_NAME);
+
+    deleteFile(tableLoader(), dummyFile.toString());
+
+    assertThat(listFiles(table)).isEqualTo(TABLE_FILES);
+  }
+
+  @Test
+  void testInvalidURIScheme() throws Exception {
+    deleteFile(tableLoader(), "wrong://");
+
+    assertThat(listFiles(table)).isEqualTo(TABLE_FILES);
+  }
+
+  private void deleteFile(TableLoader tableLoader, String fileName) throws Exception {
+    tableLoader().open();
+    try (OneInputStreamOperatorTestHarness<String, Void> testHarness =
+        new OneInputStreamOperatorTestHarness<>(
+            new DeleteFilesProcessor(table, DUMMY_TASK_NAME, 0, 10), StringSerializer.INSTANCE)) {
+      testHarness.open();
+      testHarness.processElement(fileName, System.currentTimeMillis());
+      testHarness.processWatermark(EVENT_TIME);
+      testHarness.endInput();
+    }
+  }
+
+  private static Path tablePath(Table table) {
+    return FileSystems.getDefault().getPath(table.location().substring(5));
+  }
+
+  private static Set<String> listFiles(Table table) throws IOException {
+    String tableRootPath = TestFixtures.TABLE_IDENTIFIER.toString().replace(".", "/");
+    return Files.find(
+            tablePath(table), Integer.MAX_VALUE, (filePath, fileAttr) -> fileAttr.isRegularFile())
+        .map(
+            p ->
+                p.toString()
+                    .substring(p.toString().indexOf(tableRootPath) + tableRootPath.length() + 1))
+        .collect(Collectors.toSet());
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java
new file mode 100644
index 000000000000..f073272a70b7
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.maintenance.operator;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness;
+import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.flink.maintenance.api.TaskResult;
+import org.apache.iceberg.flink.maintenance.api.Trigger;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.types.Types;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+class TestExpireSnapshotsProcessor extends OperatorTestBase {
+  @ParameterizedTest
+  @ValueSource(booleans = {true, false})
+  void testExpire(boolean success) throws Exception {
+    Table table = createTable();
+    insert(table, 1, "a");
+    insert(table, 2, "b");
+
+    List<TaskResult> actual;
+    Queue<StreamRecord<String>> deletes;
+    try (OneInputStreamOperatorTestHarness<Trigger, TaskResult> testHarness =
+        ProcessFunctionTestHarnesses.forProcessFunction(
+            new ExpireSnapshotsProcessor(tableLoader(), 0L, 1, 10, false))) {
+      testHarness.open();
+
+      if (!success) {
+        // Cause an exception
+        dropTable();
+      }
+
+      testHarness.processElement(Trigger.create(10, 11), System.currentTimeMillis());
+      deletes = testHarness.getSideOutput(ExpireSnapshotsProcessor.DELETE_STREAM);
+      actual = testHarness.extractOutputValues();
+    }
+
+    assertThat(actual).hasSize(1);
+    TaskResult result = actual.get(0);
+    assertThat(result.startEpoch()).isEqualTo(10);
+    assertThat(result.taskIndex()).isEqualTo(11);
+    assertThat(result.success()).isEqualTo(success);
+
+    if (success) {
+      assertThat(result.exceptions()).isNotNull().isEmpty();
+
+      table.refresh();
+      Set<Snapshot> snapshots = Sets.newHashSet(table.snapshots());
+      assertThat(snapshots).hasSize(1);
+      assertThat(deletes).hasSize(1);
+    } else {
+      assertThat(result.exceptions()).isNotNull().hasSize(1);
+      assertThat(deletes).isNull();
+    }
+  }
+
+  @ParameterizedTest
+  @ValueSource(booleans = {true, false})
+  void testCleanExpiredMetadata(boolean cleanExpiredMetadata) throws Exception {
+    Table table = createTable();
+    insert(table, 1, "a");
+    table.updateSchema().addColumn("extra", Types.StringType.get()).commit();
+    insert(table, 2, "b", "x");
+
+    assertThat(table.schemas()).hasSize(2);
+
+    List<TaskResult> actual;
+    Queue<StreamRecord<String>> deletes;
+    try (OneInputStreamOperatorTestHarness<Trigger, TaskResult> testHarness =
+        ProcessFunctionTestHarnesses.forProcessFunction(
+            new ExpireSnapshotsProcessor(tableLoader(), 0L, 1, 10, cleanExpiredMetadata))) {
+      testHarness.open();
+
+      testHarness.processElement(Trigger.create(10, 11), System.currentTimeMillis());
+      deletes = testHarness.getSideOutput(ExpireSnapshotsProcessor.DELETE_STREAM);
+      actual = testHarness.extractOutputValues();
+    }
+
+    assertThat(actual).hasSize(1);
+    TaskResult result = actual.get(0);
+    assertThat(result.startEpoch()).isEqualTo(10);
+    assertThat(result.taskIndex()).isEqualTo(11);
+    assertThat(result.success()).isEqualTo(true);
+    assertThat(result.exceptions()).isNotNull().isEmpty();
+
+    table.refresh();
+    Set<Snapshot> snapshots = Sets.newHashSet(table.snapshots());
+    assertThat(snapshots).hasSize(1);
+    assertThat(deletes).hasSize(1);
+
+    if (cleanExpiredMetadata) {
+      assertThat(table.schemas().values()).containsExactly(table.schema());
+    } else {
+      assertThat(table.schemas()).hasSize(2);
+    }
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java
new file mode 100644
index 000000000000..12478bb33fb2
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +class TestListFileSystemFiles extends OperatorTestBase { + @Parameter(index = 0) + private boolean usePrefixListing; + + @Parameters(name = "usePrefixListing = {0}") + private static Object[][] parameters() { + return new Object[][] {{true}, {false}}; + } + + @TestTemplate + void testMetadataFilesWithTable() throws Exception { + Table table = createTable(); + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + try (OneInputStreamOperatorTestHarness testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new ListFileSystemFiles( + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + table.location(), + 0, + usePrefixListing))) { + testHarness.open(); + OperatorTestBase.trigger(testHarness); + + assertThat(testHarness.extractOutputValues()).hasSize(11); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } + + @TestTemplate + void testMetadataFilesWithPartitionTable() throws Exception { + Table table = createPartitionedTable(); + insertPartitioned(table, 1, "p1"); + insertPartitioned(table, 2, "p1"); + insertPartitioned(table, 3, "p2"); + insertPartitioned(table, 4, "p2"); + try (OneInputStreamOperatorTestHarness testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new ListFileSystemFiles( + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + table.location(), + 0, + usePrefixListing))) { + testHarness.open(); + OperatorTestBase.trigger(testHarness); + + assertThat(testHarness.extractOutputValues()).hasSize(14); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } + + @TestTemplate + void testMetadataFilesWithEmptyTable() throws Exception { + Table table = createTable(); + try (OneInputStreamOperatorTestHarness testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new ListFileSystemFiles( + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + table.location(), + 0, + usePrefixListing))) { + testHarness.open(); + OperatorTestBase.trigger(testHarness); + + assertThat(testHarness.extractOutputValues()).hasSize(2); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java new file mode 100644 index 000000000000..bb8c74f3d5e9 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.junit.jupiter.api.Test; + +class TestListMetadataFiles extends OperatorTestBase { + + @Test + void testMetadataFilesWithTable() throws Exception { + Table table = createTable(); + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + try (OneInputStreamOperatorTestHarness testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new ListMetadataFiles(OperatorTestBase.DUMMY_TABLE_NAME, 0, tableLoader()))) { + testHarness.open(); + + OperatorTestBase.trigger(testHarness); + + List tableMetadataFiles = testHarness.extractOutputValues(); + tableMetadataFiles.forEach(System.out::println); + assertThat(tableMetadataFiles).hasSize(24); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + } + } + + @Test + void testMetadataFilesWithPartitionTable() throws Exception { + Table table = createPartitionedTable(); + insertPartitioned(table, 1, "p1"); + insertPartitioned(table, 2, "p1"); + insertPartitioned(table, 3, "p2"); + insertPartitioned(table, 4, "p2"); + try (OneInputStreamOperatorTestHarness testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new ListMetadataFiles(OperatorTestBase.DUMMY_TABLE_NAME, 0, tableLoader()))) { + testHarness.open(); + + OperatorTestBase.trigger(testHarness); + + List tableMetadataFiles = testHarness.extractOutputValues(); + assertThat(tableMetadataFiles).hasSize(38); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + } + } + + @Test + void testMetadataFilesWithEmptyTable() throws Exception { + createTable(); + try (OneInputStreamOperatorTestHarness testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new ListMetadataFiles(OperatorTestBase.DUMMY_TABLE_NAME, 0, tableLoader()))) { + testHarness.open(); + + OperatorTestBase.trigger(testHarness); + + List tableMetadataFiles = testHarness.extractOutputValues(); + assertThat(tableMetadataFiles).hasSize(0); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java new file mode 100644 index 000000000000..cec76019ae10 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.maintenance.api.LockConfig; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestLockConfig extends OperatorTestBase { + private static final String TABLE_NAME = "catalog.db.table"; + private static final String LOCK_ID = "test-lock-id"; + private Map input = Maps.newHashMap(); + private Table table; + + @BeforeEach + public void before() { + input.put("flink-maintenance.lock.type", "jdbc"); + input.put("flink-maintenance.lock.lock-id", LOCK_ID); + input.put("other.config", "should-be-ignored"); + this.table = createTable(); + } + + @AfterEach + public void after() { + input.clear(); + } + + @Test + void testConfigParsing() { + LockConfig config = new LockConfig(table, input, new Configuration()); + + assertThat(config.lockType()).isEqualTo("jdbc"); + assertThat(config.lockId(LOCK_ID)).isEqualTo(LOCK_ID); + } + + @Test + void testEmptyConfig() { + LockConfig config = new LockConfig(table, Maps.newHashMap(), new Configuration()); + + assertThat(config.lockType()).isEmpty(); + assertThat(config.lockId(TABLE_NAME)).isEqualTo(TABLE_NAME); + } + + @Test + void testWriteOptionReplaceSetConfig() { + Configuration configuration = new Configuration(); + configuration.setString("flink-maintenance.lock.type", "zk"); + configuration.setString("flink-maintenance.lock.replace-item", "test-config"); + configuration.setString("flink-maintenance.lock.jdbc.init-lock-table", "true"); + LockConfig config = new LockConfig(table, input, configuration); + + // set config should be ignored + assertThat(config.lockType()).isEqualTo("jdbc"); + assertThat(config.jdbcInitTable()).isEqualTo("true"); + + assertThat(config.properties()) + .doesNotContainKey("other.config") + .containsEntry("type", "jdbc") + .containsEntry("replace-item", "test-config"); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java new file mode 100644 index 000000000000..d32d5f840c4b --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.Map; +import org.apache.curator.test.TestingServer; +import org.apache.flink.configuration.Configuration; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.maintenance.api.LockConfig; +import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +class TestLockFactoryBuilder extends OperatorTestBase { + private static final String TABLE_NAME = "catalog.db.table"; + + private TestingServer zkTestServer; + private Table table; + + @BeforeEach + void before() { + this.table = createTable(); + try { + zkTestServer = new TestingServer(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @AfterEach + public void after() throws IOException { + if (zkTestServer != null) { + zkTestServer.close(); + } + } + + @Test + void testJdbcBuildWithMissingJdbcUri() { + Map config = Maps.newHashMap(); + config.put(LockConfig.LOCK_TYPE_OPTION.key(), LockConfig.JdbcLockConfig.JDBC); + LockConfig lockConfig = new LockConfig(table, config, new Configuration()); + + assertThatThrownBy(() -> LockFactoryBuilder.build(lockConfig, TABLE_NAME)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + String.format( + "JDBC lock requires %s parameter", + LockConfig.JdbcLockConfig.JDBC_URI_OPTION.key())); + } + + @Test + void testJdbcBuildSuccessfully() { + Map config = Maps.newHashMap(); + config.put(LockConfig.LOCK_TYPE_OPTION.key(), LockConfig.JdbcLockConfig.JDBC); + config.put(LockConfig.JdbcLockConfig.JDBC_URI_OPTION.key(), "jdbc:sqlite:file::memory:?ic"); + config.put(LockConfig.LOCK_ID_OPTION.key(), "test-lock-id"); + LockConfig lockConfig = new LockConfig(table, config, new Configuration()); + + TriggerLockFactory factory = LockFactoryBuilder.build(lockConfig, TABLE_NAME); + assertThat(factory).isNotNull(); + } + + @Test + void testZkBuildWithMissingUri() { + Map config = Maps.newHashMap(); + config.put(LockConfig.LOCK_TYPE_OPTION.key(), LockConfig.ZkLockConfig.ZK); + LockConfig lockConfig = new LockConfig(table, config, new Configuration()); + + assertThatThrownBy(() -> LockFactoryBuilder.build(lockConfig, TABLE_NAME)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + String.format( + "Zk lock requires %s parameter", LockConfig.ZkLockConfig.ZK_URI_OPTION.key())); + } + + @Test + void testZkBuildSuccessfully() { + Map config = Maps.newHashMap(); + config.put(LockConfig.LOCK_TYPE_OPTION.key(), LockConfig.ZkLockConfig.ZK); + config.put(LockConfig.ZkLockConfig.ZK_URI_OPTION.key(), zkTestServer.getConnectString()); + 
config.put(LockConfig.LOCK_ID_OPTION.key(), "test-lock-id"); + LockConfig lockConfig = new LockConfig(table, config, new Configuration()); + + TriggerLockFactory factory = LockFactoryBuilder.build(lockConfig, TABLE_NAME); + assertThat(factory).isNotNull(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java new file mode 100644 index 000000000000..7b88f20e376a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java @@ -0,0 +1,444 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.FAILED_TASK_COUNTER; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.LAST_RUN_DURATION_MS; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.SUCCEEDED_TASK_COUNTER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.util.Collection; +import java.util.List; +import org.apache.flink.api.common.functions.FlatMapFunction; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.sink2.Committer; +import org.apache.flink.api.connector.sink2.CommitterInitContext; +import org.apache.flink.api.connector.sink2.CommittingSinkWriter; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.sink2.SinkWriter; +import org.apache.flink.api.connector.sink2.SupportsCommitter; +import org.apache.flink.api.connector.sink2.WriterInitContext; +import org.apache.flink.configuration.CheckpointingOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; +import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; +import org.apache.flink.streaming.api.connector.sink2.SupportsPostCommitTopology; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.util.Collector; +import org.apache.iceberg.flink.maintenance.api.TaskResult; +import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; 
+import org.awaitility.Awaitility; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.io.TempDir; + +@Timeout(value = 10) +class TestLockRemover extends OperatorTestBase { + private static final String[] TASKS = new String[] {"task0", "task1", "task2"}; + private static final TriggerLockFactory.Lock LOCK = new TestingLock(); + private static final TriggerLockFactory.Lock RECOVERY_LOCK = new TestingLock(); + + @TempDir private File checkpointDir; + + @BeforeEach + void before() { + MetricsReporterFactoryForTests.reset(); + } + + @Test + void testProcess() throws Exception { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + ManualSource source = new ManualSource<>(env, TypeInformation.of(TaskResult.class)); + source + .dataStream() + .transform( + DUMMY_TASK_NAME, + TypeInformation.of(Void.class), + new LockRemover(DUMMY_TABLE_NAME, new TestingLockFactory(), Lists.newArrayList(TASKS))) + .setParallelism(1); + + JobClient jobClient = null; + try { + jobClient = env.executeAsync(); + + LOCK.tryLock(); + assertThat(LOCK.isHeld()).isTrue(); + + // Start a successful trigger for task1 and assert the return value is correct + processAndCheck(source, new TaskResult(0, 0L, true, Lists.newArrayList())); + + // Assert that the lock is removed + assertThat(LOCK.isHeld()).isFalse(); + } finally { + closeJobClient(jobClient); + } + } + + @Test + void testInSink() throws Exception { + String sinkName = "TestSink"; + Configuration config = new Configuration(); + config.set(CheckpointingOptions.CHECKPOINT_STORAGE, "filesystem"); + config.set(CheckpointingOptions.CHECKPOINTS_DIRECTORY, "file://" + checkpointDir.getPath()); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(config); + env.enableCheckpointing(10); + ManualSource source = new ManualSource<>(env, TypeInformation.of(TaskResult.class)); + source.dataStream().global().sinkTo(new SinkTest()).name(sinkName).setParallelism(1); + + JobClient jobClient = null; + try { + jobClient = env.executeAsync(); + + LOCK.tryLock(); + assertThat(LOCK.isHeld()).isTrue(); + + // Start a successful trigger for task1 and assert the return value is correct + processAndCheck(source, new TaskResult(0, 0L, true, Lists.newArrayList()), sinkName + ": "); + + // Assert that the lock is removed + assertThat(LOCK.isHeld()).isFalse(); + } finally { + closeJobClient(jobClient); + } + } + + @Test + void testMetrics() throws Exception { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + ManualSource source = new ManualSource<>(env, TypeInformation.of(TaskResult.class)); + source + .dataStream() + .transform( + DUMMY_TASK_NAME, + TypeInformation.of(Void.class), + new LockRemover(DUMMY_TABLE_NAME, new TestingLockFactory(), Lists.newArrayList(TASKS))) + .setParallelism(1); + + JobClient jobClient = null; + long time = System.currentTimeMillis(); + try { + jobClient = env.executeAsync(); + // Start the 2 successful and one failed result trigger for task1, and 3 successful for task2 + processAndCheck(source, new TaskResult(0, time, true, Lists.newArrayList())); + processAndCheck(source, new TaskResult(1, 0L, true, Lists.newArrayList())); + processAndCheck(source, new TaskResult(1, 0L, true, Lists.newArrayList())); + processAndCheck(source, new TaskResult(0, time, false, Lists.newArrayList())); + processAndCheck(source, new TaskResult(0, time, true, Lists.newArrayList())); + 
processAndCheck(source, new TaskResult(1, 0L, true, Lists.newArrayList())); + + Awaitility.await() + .until( + () -> + MetricsReporterFactoryForTests.counter( + ImmutableList.of( + DUMMY_TASK_NAME, + DUMMY_TABLE_NAME, + TASKS[1], + "1", + SUCCEEDED_TASK_COUNTER)) + .equals(3L)); + + // Final check all the counters + MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put( + ImmutableList.of( + DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[0], "0", SUCCEEDED_TASK_COUNTER), + 2L) + .put( + ImmutableList.of( + DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[0], "0", FAILED_TASK_COUNTER), + 1L) + .put( + ImmutableList.of( + DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[1], "1", SUCCEEDED_TASK_COUNTER), + 3L) + .put( + ImmutableList.of( + DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[1], "1", FAILED_TASK_COUNTER), + 0L) + .put( + ImmutableList.of( + DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[2], "2", SUCCEEDED_TASK_COUNTER), + 0L) + .put( + ImmutableList.of( + DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[2], "2", FAILED_TASK_COUNTER), + 0L) + .build()); + + assertThat( + MetricsReporterFactoryForTests.gauge( + ImmutableList.of( + DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[0], "0", LAST_RUN_DURATION_MS))) + .isPositive(); + assertThat( + MetricsReporterFactoryForTests.gauge( + ImmutableList.of( + DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[1], "1", LAST_RUN_DURATION_MS))) + .isGreaterThan(time); + assertThat( + MetricsReporterFactoryForTests.gauge( + ImmutableList.of( + DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[2], "2", LAST_RUN_DURATION_MS))) + .isZero(); + } finally { + closeJobClient(jobClient); + } + } + + /** + * The test checks if the recovery watermark is only removed if the watermark has arrived from + * both upstream sources. + * + * @throws Exception if any + */ + @Test + void testRecovery() throws Exception { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + ManualSource source1 = + new ManualSource<>(env, TypeInformation.of(TaskResult.class)); + ManualSource source2 = + new ManualSource<>(env, TypeInformation.of(TaskResult.class)); + source1 + .dataStream() + .union(source2.dataStream()) + .transform( + DUMMY_TASK_NAME, + TypeInformation.of(Void.class), + new LockRemover( + DUMMY_TABLE_NAME, new TestingLockFactory(), Lists.newArrayList(TASKS[0]))) + .setParallelism(1); + + JobClient jobClient = null; + try { + jobClient = env.executeAsync(); + RECOVERY_LOCK.tryLock(); + assertThat(RECOVERY_LOCK.isHeld()).isTrue(); + + processAndCheck(source1, new TaskResult(0, 0L, true, Lists.newArrayList())); + + source1.sendRecord(new TaskResult(0, 1L, true, Lists.newArrayList())); + // we receive the second result - this will not happen in real use cases, but with this we can + // be sure that the previous watermark is processed + Awaitility.await() + .until( + () -> + MetricsReporterFactoryForTests.counter( + ImmutableList.of( + DUMMY_TASK_NAME, + DUMMY_TABLE_NAME, + TASKS[0], + "0", + SUCCEEDED_TASK_COUNTER)) + .equals(2L)); + + // We did not remove the recovery lock, as no watermark received from the other source + assertThat(RECOVERY_LOCK.isHeld()).isTrue(); + + // Recovery arrives + source1.sendWatermark(10L); + source2.sendWatermark(10L); + + Awaitility.await().until(() -> !RECOVERY_LOCK.isHeld()); + } finally { + closeJobClient(jobClient); + } + } + + private void processAndCheck(ManualSource source, TaskResult input) { + processAndCheck(source, input, null); + } + + private void processAndCheck( + ManualSource source, TaskResult input, String 
counterPrefix) {
+    List<String> counterKey =
+        ImmutableList.of(
+            (counterPrefix != null ? counterPrefix : "") + DUMMY_TASK_NAME,
+            DUMMY_TABLE_NAME,
+            TASKS[input.taskIndex()],
+            String.valueOf(input.taskIndex()),
+            input.success() ? SUCCEEDED_TASK_COUNTER : FAILED_TASK_COUNTER);
+    Long counterValue = MetricsReporterFactoryForTests.counter(counterKey);
+    Long expected = counterValue != null ? counterValue + 1 : 1L;
+
+    source.sendRecord(input);
+    source.sendWatermark(input.startEpoch());
+
+    Awaitility.await()
+        .until(() -> expected.equals(MetricsReporterFactoryForTests.counter(counterKey)));
+  }
+
+  private static class TestingLockFactory implements TriggerLockFactory {
+
+    private boolean open = false;
+
+    @Override
+    public void open() {
+      open = true;
+    }
+
+    @Override
+    public Lock createLock() {
+      if (!open) {
+        throw new IllegalStateException("Lock factory not open");
+      }
+
+      return LOCK;
+    }
+
+    @Override
+    public Lock createRecoveryLock() {
+      if (!open) {
+        throw new IllegalStateException("Lock factory not open");
+      }
+
+      return RECOVERY_LOCK;
+    }
+
+    @Override
+    public void close() {
+      open = false;
+    }
+  }
+
+  private static class TestingLock implements TriggerLockFactory.Lock {
+    private boolean locked = false;
+
+    @Override
+    public boolean tryLock() {
+      if (isHeld()) {
+        return false;
+      } else {
+        locked = true;
+        return true;
+      }
+    }
+
+    @Override
+    public boolean isHeld() {
+      return locked;
+    }
+
+    @Override
+    public void unlock() {
+      locked = false;
+    }
+  }
+
+  private static class SinkTest
+      implements Sink<TaskResult>,
+          SupportsCommitter<TaskResult>,
+          SupportsPostCommitTopology<TaskResult, TaskResult> {
+    @Override
+    public SinkWriter<TaskResult> createWriter(WriterInitContext initContext) {
+      return new CommittingSinkWriter<TaskResult, TaskResult>() {
+        private final Collection<TaskResult> received = Lists.newArrayList();
+
+        @Override
+        public Collection<TaskResult> prepareCommit() {
+          Collection<TaskResult> result = Lists.newArrayList(received);
+          received.clear();
+          return result;
+        }
+
+        @Override
+        public void write(TaskResult taskResult, Context context) {
+          received.add(taskResult);
+        }
+
+        @Override
+        public void flush(boolean b) {
+          // noop
+        }
+
+        @Override
+        public void close() {
+          // noop
+        }
+      };
+    }
+
+    @Override
+    public Committer<TaskResult> createCommitter(CommitterInitContext committerInitContext) {
+      return new Committer<>() {
+        @Override
+        public void commit(Collection<CommitRequest<TaskResult>> collection) {
+          // noop
+        }
+
+        @Override
+        public void close() {
+          // noop
+        }
+      };
+    }
+
+    @Override
+    public SimpleVersionedSerializer<TaskResult> getCommittableSerializer() {
+      return new SimpleVersionedSerializer<>() {
+        @Override
+        public int getVersion() {
+          return 0;
+        }
+
+        @Override
+        public byte[] serialize(TaskResult taskResult) {
+          return new byte[0];
+        }
+
+        @Override
+        public TaskResult deserialize(int i, byte[] bytes) {
+          return null;
+        }
+      };
+    }
+
+    @Override
+    public void addPostCommitTopology(DataStream<CommittableMessage<TaskResult>> committables) {
+      committables
+          .flatMap(
+              new FlatMapFunction<CommittableMessage<TaskResult>, TaskResult>() {
+                @Override
+                public void flatMap(
+                    CommittableMessage<TaskResult> taskResultCommittableMessage,
+                    Collector<TaskResult> collector) {
+                  if (taskResultCommittableMessage instanceof CommittableWithLineage) {
+                    collector.collect(
+                        ((CommittableWithLineage<TaskResult>) taskResultCommittableMessage)
+                            .getCommittable());
+                  }
+                }
+              })
+          .transform(
+              DUMMY_TASK_NAME,
+              TypeInformation.of(Void.class),
+              new LockRemover(
+                  DUMMY_TABLE_NAME, new TestingLockFactory(), Lists.newArrayList(TASKS[0])));
+    }
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java
b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java new file mode 100644 index 000000000000..9c1ea2f2295a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java @@ -0,0 +1,355 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.File; +import java.io.IOException; +import java.time.Duration; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicReference; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; +import org.apache.flink.client.program.ClusterClient; +import org.apache.flink.configuration.CheckpointingOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.runtime.client.JobExecutionException; +import org.apache.flink.runtime.jobgraph.SavepointRestoreSettings; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.graph.StreamGraph; +import org.apache.flink.test.junit5.InjectClusterClient; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.RewriteFiles; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +class TestMonitorSource extends OperatorTestBase { + private static final TableChange EMPTY_EVENT = TableChange.empty(); + private static final RateLimiterStrategy HIGH_RATE = RateLimiterStrategy.perSecond(100.0); + private static final RateLimiterStrategy LOW_RATE = RateLimiterStrategy.perSecond(1.0 / 10000.0); + + @TempDir private File checkpointDir; + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testChangeReaderIterator(boolean withDelete) throws IOException { + Table 
table = withDelete ? createTableWithDelete() : createTable(); + + MonitorSource.TableChangeIterator iterator = + new MonitorSource.TableChangeIterator(tableLoader(), null, Long.MAX_VALUE); + + // For an empty table we get an empty result + assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); + + // Add a single commit and get back the commit data in the event + insert(table, 1, "a"); + TableChange expected = tableChangeWithLastSnapshot(table, TableChange.empty()); + assertThat(iterator.next()).isEqualTo(expected); + // Make sure that consecutive calls do not return the data again + assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); + + // Add two more commits, but fetch the data in one loop + insert(table, 2, "b"); + expected = tableChangeWithLastSnapshot(table, TableChange.empty()); + + insert(table, 3, "c"); + expected = tableChangeWithLastSnapshot(table, expected); + + assertThat(iterator.next()).isEqualTo(expected); + // Make sure that consecutive calls do not return the data again + assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); + } + + /** + * Create a table and check that the source returns the data as new commits arrive to the table. + */ + @Test + void testSource() throws Exception { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + Table table = createTable(); + DataStream events = + env.fromSource( + new MonitorSource(tableLoader(), HIGH_RATE, Long.MAX_VALUE), + WatermarkStrategy.noWatermarks(), + "TableChangeSource") + .forceNonParallel(); + + // Sink to collect the results + CollectingSink result = new CollectingSink<>(); + events.sinkTo(result); + + JobClient jobClient = null; + try { + // First result is an empty event + jobClient = env.executeAsync("Table Change Source Test"); + assertThat(result.poll(Duration.ofSeconds(5L))).isEqualTo(EMPTY_EVENT); + + // Insert some data + File dataDir = new File(new Path(table.location(), "data").toUri().getPath()); + dataDir.mkdir(); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(table, FileFormat.PARQUET, dataDir.toPath()); + List batch1 = RandomGenericData.generate(table.schema(), 2, 1); + dataAppender.appendToTable(batch1); + + // Wait until the changes are committed + Awaitility.await() + .until( + () -> { + table.refresh(); + return table.currentSnapshot() != null; + }); + + table.refresh(); + long size = firstFileLength(table); + + // Wait until the first non-empty event has arrived, and check the expected result + Awaitility.await() + .until( + () -> { + TableChange newEvent = result.poll(Duration.ofSeconds(5L)); + // Fetch every empty event from the beginning + while (newEvent.equals(EMPTY_EVENT)) { + newEvent = result.poll(Duration.ofSeconds(5L)); + } + + // The first non-empty event should contain the expected value + return newEvent.equals( + TableChange.builder() + .dataFileCount(1) + .dataFileSizeInBytes(size) + .commitCount(1) + .build()); + }); + } finally { + closeJobClient(jobClient); + } + } + + /** Check that the {@link MonitorSource} operator state is restored correctly. 
*/ + @Test + void testStateRestore( + @TempDir File savepointDir, @InjectClusterClient ClusterClient clusterClient) + throws Exception { + Table table = createTable(); + insert(table, 1, "a"); + TableLoader tableLoader = tableLoader(); + + Configuration config = new Configuration(); + config.set(CheckpointingOptions.CHECKPOINT_STORAGE, "filesystem"); + config.set(CheckpointingOptions.CHECKPOINTS_DIRECTORY, "file://" + checkpointDir.getPath()); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(config); + env.enableCheckpointing(1000); + + DataStream events = + env.fromSource( + new MonitorSource(tableLoader, HIGH_RATE, Long.MAX_VALUE), + WatermarkStrategy.noWatermarks(), + "TableChangeSource") + .forceNonParallel(); + + // Sink to collect the results + CollectingSink result = new CollectingSink<>(); + events.sinkTo(result); + + // Start the job + String savepointPath; + JobClient jobClient = null; + AtomicReference firstNonEmptyEvent = new AtomicReference<>(); + try { + jobClient = env.executeAsync("Table Change Source Test"); + + Awaitility.await() + .until( + () -> { + TableChange newEvent = result.poll(Duration.ofSeconds(5L)); + // Fetch every empty event from the beginning + while (newEvent.equals(EMPTY_EVENT)) { + newEvent = result.poll(Duration.ofSeconds(5L)); + } + + // The first non-empty event should contain the expected value + firstNonEmptyEvent.set(newEvent); + return true; + }); + } finally { + // Stop with savepoint + savepointPath = closeJobClient(jobClient, savepointDir); + } + + // Restore from savepoint, create the same topology with a different env + env = StreamExecutionEnvironment.getExecutionEnvironment(); + events = + env.fromSource( + new MonitorSource(tableLoader, LOW_RATE, Long.MAX_VALUE), + WatermarkStrategy.noWatermarks(), + "TableChangeSource") + .forceNonParallel(); + CollectingSink resultWithSavepoint = new CollectingSink<>(); + events.sinkTo(resultWithSavepoint); + + // Make sure that the job with restored source does not read new records from the table + StreamGraph streamGraph = env.getStreamGraph(); + streamGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath)); + CompletableFuture jobIDCompletableFuture = clusterClient.submitJob(streamGraph); + try { + assertThat(resultWithSavepoint.poll(Duration.ofSeconds(5L))).isEqualTo(EMPTY_EVENT); + } finally { + clusterClient.cancel(jobIDCompletableFuture.get()); + } + + // Restore without savepoint + env = StreamExecutionEnvironment.getExecutionEnvironment(); + events = + env.fromSource( + new MonitorSource(tableLoader, LOW_RATE, Long.MAX_VALUE), + WatermarkStrategy.noWatermarks(), + "TableChangeSource") + .forceNonParallel(); + CollectingSink resultWithoutSavepoint = new CollectingSink<>(); + events.sinkTo(resultWithoutSavepoint); + + // Make sure that a new job without state reads the event as expected + JobClient clientWithoutSavepoint = null; + try { + clientWithoutSavepoint = env.executeAsync("Table Change Source Test without savepoint"); + assertThat(resultWithoutSavepoint.poll(Duration.ofSeconds(5L))) + .isEqualTo(firstNonEmptyEvent.get()); + } finally { + closeJobClient(clientWithoutSavepoint); + } + } + + @Test + void testNotOneParallelismThrows() { + createTable(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + env.fromSource( + new MonitorSource(tableLoader(), HIGH_RATE, Long.MAX_VALUE), + WatermarkStrategy.noWatermarks(), + "TableChangeSource") + .setParallelism(2) + .print(); + + 
assertThatThrownBy(env::execute) + .isInstanceOf(JobExecutionException.class) + .rootCause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Parallelism should be set to 1"); + } + + @Test + void testMaxReadBack() throws IOException { + Table table = createTable(); + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + + TableLoader tableLoader = tableLoader(); + + MonitorSource.TableChangeIterator iterator = + new MonitorSource.TableChangeIterator(tableLoader, null, 1); + + // For a single maxReadBack we only get a single change + assertThat(iterator.next().commitCount()).isEqualTo(1); + + iterator = new MonitorSource.TableChangeIterator(tableLoader, null, 2); + + // Expecting 2 commits/snapshots for maxReadBack=2 + assertThat(iterator.next().commitCount()).isEqualTo(2); + + iterator = new MonitorSource.TableChangeIterator(tableLoader, null, Long.MAX_VALUE); + + // For maxReadBack Long.MAX_VALUE we get every change + assertThat(iterator.next().commitCount()).isEqualTo(3); + } + + @Test + void testSkipReplace() throws IOException { + Table table = createTable(); + insert(table, 1, "a"); + + TableLoader tableLoader = tableLoader(); + + MonitorSource.TableChangeIterator iterator = + new MonitorSource.TableChangeIterator(tableLoader, null, Long.MAX_VALUE); + + // Read the current snapshot + assertThat(iterator.next().commitCount()).isEqualTo(1); + + // Create a DataOperations.REPLACE snapshot + DataFile dataFile = + table.snapshots().iterator().next().addedDataFiles(table.io()).iterator().next(); + RewriteFiles rewrite = tableLoader.loadTable().newRewrite(); + // Replace the file with itself for testing purposes + rewrite.deleteFile(dataFile); + rewrite.addFile(dataFile); + rewrite.commit(); + + // Check that the rewrite is ignored + assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); + } + + private static long firstFileLength(Table table) { + return table.currentSnapshot().addedDataFiles(table.io()).iterator().next().fileSizeInBytes(); + } + + private static TableChange tableChangeWithLastSnapshot(Table table, TableChange previous) { + List dataFiles = + Lists.newArrayList(table.currentSnapshot().addedDataFiles(table.io()).iterator()); + List deleteFiles = + Lists.newArrayList(table.currentSnapshot().addedDeleteFiles(table.io()).iterator()); + + long dataSize = dataFiles.stream().mapToLong(ContentFile::fileSizeInBytes).sum(); + long deleteRecordCount = deleteFiles.stream().mapToLong(DeleteFile::recordCount).sum(); + + TableChange newChange = previous.copy(); + newChange.merge( + TableChange.builder() + .dataFileCount(dataFiles.size()) + .dataFileSizeInBytes(dataSize) + // Currently we only test with equality deletes + .eqDeleteFileCount(deleteFiles.size()) + .eqDeleteRecordCount(deleteRecordCount) + .commitCount(1) + .build()); + return newChange; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java new file mode 100644 index 000000000000..de3d01409b9d --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Map; +import java.util.concurrent.ConcurrentLinkedQueue; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.util.KeyedTwoInputStreamOperatorTestHarness; +import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.actions.DeleteOrphanFiles.PrefixMismatchMode; +import org.apache.iceberg.actions.FileURI; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; + +public class TestOrphanFilesDetector extends OperatorTestBase { + private static final Map EQUAL_SCHEMES = + Maps.newHashMap( + ImmutableMap.of( + "s3n", "s3", + "s3a", "s3")); + private static final Map EQUAL_AUTHORITIES = Maps.newHashMap(); + private static final String SCHEME_FILE_1 = "s3:/fileName1"; + private static final String AUTHORITY_FILE_1 = "s3://HDFS1002060/fileName1"; + private static final String ONE_AUTHORITY_SCHEME_FILE_1 = "s3a://HDFS1002060/fileName1"; + private static final String TWO_AUTHORITY_SCHEME_FILE_1 = "s3b://HDFS1002060/fileName1"; + + @Test + void testFileSystemFirst() throws Exception { + try (KeyedTwoInputStreamOperatorTestHarness testHarness = + testHarness()) { + testHarness.open(); + + testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); + testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); + testHarness.processWatermark1(WATERMARK); + testHarness.processElement1(SCHEME_FILE_1, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processWatermark2(WATERMARK); + assertThat(testHarness.extractOutputValues()).isEmpty(); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } + + @Test + void testTableFirst() throws Exception { + try (KeyedTwoInputStreamOperatorTestHarness testHarness = + testHarness()) { + testHarness.open(); + + testHarness.processElement1(SCHEME_FILE_1, EVENT_TIME); + testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); + testHarness.processWatermark1(WATERMARK); + testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processWatermark2(WATERMARK); + assertThat(testHarness.extractOutputValues()).isEmpty(); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } + + @Test + void testOnlyFileSystem() throws Exception { + 
try (KeyedTwoInputStreamOperatorTestHarness testHarness = + testHarness()) { + testHarness.open(); + + testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); + testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processBothWatermarks(WATERMARK); + assertThat(testHarness.extractOutputValues()).isEqualTo(ImmutableList.of(SCHEME_FILE_1)); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } + + @Test + void testOnlyTable() throws Exception { + try (KeyedTwoInputStreamOperatorTestHarness testHarness = + testHarness()) { + testHarness.open(); + + testHarness.processElement1(SCHEME_FILE_1, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processBothWatermarks(WATERMARK); + assertThat(testHarness.extractOutputValues()).isEmpty(); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } + + @Test + void testFileSystemWithAuthority() throws Exception { + try (KeyedTwoInputStreamOperatorTestHarness testHarness = + testHarness()) { + testHarness.open(); + + testHarness.processElement1(SCHEME_FILE_1, EVENT_TIME); + testHarness.processElement2(AUTHORITY_FILE_1, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processBothWatermarks(WATERMARK); + assertThat(testHarness.extractOutputValues()).isEmpty(); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } + + @Test + void testTableWithAuthority() throws Exception { + try (KeyedTwoInputStreamOperatorTestHarness testHarness = + testHarness()) { + testHarness.open(); + + testHarness.processElement1(AUTHORITY_FILE_1, EVENT_TIME); + testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processBothWatermarks(WATERMARK); + ConcurrentLinkedQueue> errorList = + testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM); + assertThat(errorList).hasSize(1); + assertThat(errorList.stream().findFirst().get().getValue()) + .isInstanceOf(ValidationException.class); + + assertThat(testHarness.extractOutputValues()).isEmpty(); + } + } + + @Test + void testDiffScheme() throws Exception { + try (KeyedTwoInputStreamOperatorTestHarness testHarness = + testHarness()) { + testHarness.open(); + + testHarness.processElement1(AUTHORITY_FILE_1, EVENT_TIME); + testHarness.processElement2(ONE_AUTHORITY_SCHEME_FILE_1, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processBothWatermarks(WATERMARK); + assertThat(testHarness.extractOutputValues()).isEmpty(); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } + + @Test + void testUnRegisterScheme() throws Exception { + try (KeyedTwoInputStreamOperatorTestHarness testHarness = + testHarness()) { + testHarness.open(); + + testHarness.processElement1(AUTHORITY_FILE_1, EVENT_TIME); + testHarness.processElement2(TWO_AUTHORITY_SCHEME_FILE_1, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processBothWatermarks(WATERMARK); + ConcurrentLinkedQueue> errorList = + testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM); + assertThat(errorList).hasSize(1); + assertThat(errorList.stream().findFirst().get().getValue()) + .isInstanceOf(ValidationException.class); + + assertThat(testHarness.extractOutputValues()).isEmpty(); + } + } + + @Test + void testPrefixMismatchModeDelete() throws Exception { + try 
(KeyedTwoInputStreamOperatorTestHarness testHarness = + testHarness(PrefixMismatchMode.DELETE)) { + testHarness.open(); + + testHarness.processElement1(AUTHORITY_FILE_1, EVENT_TIME); + testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processBothWatermarks(WATERMARK); + assertThat(testHarness.extractOutputValues()).isEqualTo(ImmutableList.of(SCHEME_FILE_1)); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } + + @Test + void testPrefixMismatchModeIgnore() throws Exception { + try (KeyedTwoInputStreamOperatorTestHarness testHarness = + testHarness(PrefixMismatchMode.IGNORE)) { + testHarness.open(); + + testHarness.processElement1(AUTHORITY_FILE_1, EVENT_TIME); + testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processBothWatermarks(WATERMARK); + assertThat(testHarness.extractOutputValues()).isEmpty(); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } + + @Test + void testMultiAuthority() throws Exception { + try (KeyedTwoInputStreamOperatorTestHarness testHarness = + testHarness(PrefixMismatchMode.IGNORE)) { + testHarness.open(); + + testHarness.processElement1(TWO_AUTHORITY_SCHEME_FILE_1, EVENT_TIME); + testHarness.processElement1(ONE_AUTHORITY_SCHEME_FILE_1, EVENT_TIME); + testHarness.processElement2(AUTHORITY_FILE_1, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processBothWatermarks(WATERMARK); + assertThat(testHarness.extractOutputValues()).isEmpty(); + assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); + } + } + + private static KeyedTwoInputStreamOperatorTestHarness testHarness( + PrefixMismatchMode prefixMismatchMode) throws Exception { + return ProcessFunctionTestHarnesses.forKeyedCoProcessFunction( + new OrphanFilesDetector(prefixMismatchMode, EQUAL_SCHEMES, EQUAL_AUTHORITIES), + (KeySelector) + t -> new FileURI(new Path(t).toUri(), EQUAL_SCHEMES, EQUAL_AUTHORITIES).getPath(), + (KeySelector) + t -> new FileURI(new Path(t).toUri(), EQUAL_SCHEMES, EQUAL_AUTHORITIES).getPath(), + BasicTypeInfo.STRING_TYPE_INFO); + } + + private static KeyedTwoInputStreamOperatorTestHarness + testHarness() throws Exception { + return testHarness(PrefixMismatchMode.ERROR); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java new file mode 100644 index 000000000000..ce5b7ad82ac1 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.streaming.util.TwoInputStreamOperatorTestHarness; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +class TestSkipOnError extends OperatorTestBase { + + private static final Exception EXCEPTION = new Exception("Test error"); + + @Test + void testNoFailure() throws Exception { + try (TwoInputStreamOperatorTestHarness testHarness = + new TwoInputStreamOperatorTestHarness<>(new SkipOnError())) { + testHarness.open(); + + testHarness.processElement1(FILE_NAME_1, EVENT_TIME); + testHarness.processElement1(FILE_NAME_2, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + + testHarness.processBothWatermarks(WATERMARK); + assertThat(testHarness.extractOutputValues()) + .isEqualTo(ImmutableList.of(FILE_NAME_1, FILE_NAME_2)); + } + } + + @Test + void testFailure() throws Exception { + try (TwoInputStreamOperatorTestHarness testHarness = + new TwoInputStreamOperatorTestHarness<>(new SkipOnError())) { + testHarness.open(); + + testHarness.processElement1(FILE_NAME_1, EVENT_TIME); + testHarness.processElement2(EXCEPTION, EVENT_TIME); + testHarness.processElement1(FILE_NAME_2, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + + testHarness.processBothWatermarks(WATERMARK); + assertThat(testHarness.extractOutputValues()).isEmpty(); + } + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testStateRestore(boolean withError) throws Exception { + OperatorSubtaskState state; + try (TwoInputStreamOperatorTestHarness testHarness = + new TwoInputStreamOperatorTestHarness<>(new SkipOnError())) { + testHarness.open(); + + testHarness.processElement1(FILE_NAME_1, EVENT_TIME); + if (withError) { + testHarness.processElement2(EXCEPTION, EVENT_TIME); + } + + assertThat(testHarness.extractOutputValues()).isEmpty(); + state = testHarness.snapshot(1L, EVENT_TIME); + } + + try (TwoInputStreamOperatorTestHarness testHarness = + new TwoInputStreamOperatorTestHarness<>(new SkipOnError())) { + testHarness.initializeState(state); + testHarness.open(); + + testHarness.processElement1(FILE_NAME_2, EVENT_TIME); + + assertThat(testHarness.extractOutputValues()).isEmpty(); + testHarness.processBothWatermarks(WATERMARK); + if (withError) { + assertThat(testHarness.extractOutputValues()).isEmpty(); + } else { + assertThat(testHarness.extractOutputValues()) + .isEqualTo(ImmutableList.of(FILE_NAME_1, FILE_NAME_2)); + } + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java new file mode 100644 index 000000000000..87b0303b488d --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.flink.source.ScanContext; +import org.junit.jupiter.api.Test; + +class TestTablePlanerAndReader extends OperatorTestBase { + private static final Schema FILE_PATH_SCHEMA = new Schema(DataFile.FILE_PATH); + private static final ScanContext FILE_PATH_SCAN_CONTEXT = + ScanContext.builder().streaming(true).project(FILE_PATH_SCHEMA).build(); + + @Test + void testTablePlaneAndRead() throws Exception { + Table table = createTable(); + insert(table, 1, "a"); + insert(table, 2, "b"); + List icebergSourceSplits; + try (OneInputStreamOperatorTestHarness testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new MetadataTablePlanner( + OperatorTestBase.DUMMY_TASK_NAME, + 0, + tableLoader(), + FILE_PATH_SCAN_CONTEXT, + MetadataTableType.ALL_FILES, + 1))) { + testHarness.open(); + OperatorTestBase.trigger(testHarness); + icebergSourceSplits = testHarness.extractOutputValues(); + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + } + + try (OneInputStreamOperatorTestHarness testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new FileNameReader( + OperatorTestBase.DUMMY_TASK_NAME, + 0, + tableLoader(), + FILE_PATH_SCHEMA, + FILE_PATH_SCAN_CONTEXT, + MetadataTableType.ALL_FILES))) { + testHarness.open(); + for (MetadataTablePlanner.SplitInfo icebergSourceSplit : icebergSourceSplits) { + testHarness.processElement(icebergSourceSplit, System.currentTimeMillis()); + } + + assertThat(testHarness.extractOutputValues()).hasSize(2); + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + } + } + + @Test + void testTablePlaneAndReadWithPartitionedTable() throws Exception { + Table table = createPartitionedTable(); + insertPartitioned(table, 1, "p1"); + insertPartitioned(table, 2, "p1"); + insertPartitioned(table, 3, "p2"); + insertPartitioned(table, 4, "p2"); + List icebergSourceSplits; + try (OneInputStreamOperatorTestHarness testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new MetadataTablePlanner( + OperatorTestBase.DUMMY_TASK_NAME, + 0, + tableLoader(), + FILE_PATH_SCAN_CONTEXT, + MetadataTableType.ALL_FILES, + 1))) { + testHarness.open(); + OperatorTestBase.trigger(testHarness); + icebergSourceSplits = testHarness.extractOutputValues(); + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + } + + try (OneInputStreamOperatorTestHarness testHarness = + 
ProcessFunctionTestHarnesses.forProcessFunction( + new FileNameReader( + OperatorTestBase.DUMMY_TASK_NAME, + 0, + tableLoader(), + FILE_PATH_SCHEMA, + FILE_PATH_SCAN_CONTEXT, + MetadataTableType.ALL_FILES))) { + testHarness.open(); + for (MetadataTablePlanner.SplitInfo icebergSourceSplit : icebergSourceSplits) { + testHarness.processElement(icebergSourceSplit, System.currentTimeMillis()); + } + + assertThat(testHarness.extractOutputValues()).hasSize(4); + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java new file mode 100644 index 000000000000..51d901e923c7 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.flink.maintenance.operator;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.List;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.streaming.util.TwoInputStreamOperatorTestHarness;
+import org.apache.iceberg.flink.maintenance.api.TaskResult;
+import org.apache.iceberg.flink.maintenance.api.Trigger;
+import org.junit.jupiter.api.Test;
+
+class TestTaskResultAggregator extends OperatorTestBase {
+
+  @Test
+  void testPassWatermark() throws Exception {
+    TaskResultAggregator taskResultAggregator =
+        new TaskResultAggregator("table-name", "task-name", 0);
+    try (TwoInputStreamOperatorTestHarness<Trigger, Exception, TaskResult> testHarness =
+        new TwoInputStreamOperatorTestHarness<>(taskResultAggregator)) {
+      testHarness.open();
+      testHarness.processBothWatermarks(WATERMARK);
+      ConcurrentLinkedQueue<Object> output = testHarness.getOutput();
+      assertThat(output).containsOnlyOnce(WATERMARK);
+    }
+  }
+
+  @Test
+  void testProcessWatermarkWithoutElement() throws Exception {
+    TaskResultAggregator taskResultAggregator =
+        new TaskResultAggregator("table-name", "task-name", 0);
+    try (TwoInputStreamOperatorTestHarness<Trigger, Exception, TaskResult> testHarness =
+        new TwoInputStreamOperatorTestHarness<>(taskResultAggregator)) {
+      testHarness.open();
+      testHarness.processBothWatermarks(WATERMARK);
+      List<TaskResult> taskResults = testHarness.extractOutputValues();
+      assertThat(taskResults).hasSize(0);
+    }
+  }
+
+  @Test
+  void testProcessWatermark() throws Exception {
+    TaskResultAggregator taskResultAggregator =
+        new TaskResultAggregator("table-name", "task-name", 0);
+    try (TwoInputStreamOperatorTestHarness<Trigger, Exception, TaskResult> testHarness =
+        new TwoInputStreamOperatorTestHarness<>(taskResultAggregator)) {
+      testHarness.open();
+
+      testHarness.processElement1(new StreamRecord<>(Trigger.create(EVENT_TIME, 0)));
+      testHarness.processBothWatermarks(WATERMARK);
+      List<TaskResult> taskResults = testHarness.extractOutputValues();
+      assertThat(taskResults).hasSize(1);
+      TaskResult taskResult = taskResults.get(0);
+      assertThat(taskResult.taskIndex()).isEqualTo(0);
+      assertThat(taskResult.startEpoch()).isEqualTo(EVENT_TIME);
+      assertThat(taskResult.success()).isEqualTo(true);
+      assertThat(taskResult.exceptions()).hasSize(0);
+    }
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java
new file mode 100644
index 000000000000..63bea00f346e
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java
@@ -0,0 +1,671 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.CONCURRENT_RUN_THROTTLED; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.NOTHING_TO_TRIGGER; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.RATE_LIMITER_TRIGGERED; +import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.TRIGGERED; +import static org.assertj.core.api.Assertions.assertThat; + +import java.time.Duration; +import java.util.List; +import java.util.stream.Stream; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.operators.KeyedProcessOperator; +import org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.maintenance.api.Trigger; +import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class TestTriggerManager extends OperatorTestBase { + private static final long DELAY = 10L; + private static final String[] TASKS = new String[] {"task0", "task1"}; + private long processingTime = 0L; + private TriggerLockFactory.Lock lock; + private TriggerLockFactory.Lock recoveringLock; + private String tableName; + + @BeforeEach + void before() { + super.before(); + Table table = createTable(); + this.lock = LOCK_FACTORY.createLock(); + this.recoveringLock = LOCK_FACTORY.createRecoveryLock(); + this.tableName = table.name(); + } + + @Test + void testCommitCount() throws Exception { + TriggerManager manager = + manager(tableLoader(), new TriggerEvaluator.Builder().commitCount(3).build()); + try (KeyedOneInputStreamOperatorTestHarness testHarness = + harness(manager)) { + testHarness.open(); + + addEventAndCheckResult(testHarness, TableChange.builder().commitCount(1).build(), 0); + addEventAndCheckResult(testHarness, TableChange.builder().commitCount(2).build(), 1); + addEventAndCheckResult(testHarness, TableChange.builder().commitCount(3).build(), 2); + addEventAndCheckResult(testHarness, TableChange.builder().commitCount(10).build(), 3); + + // No trigger in this case + addEventAndCheckResult(testHarness, TableChange.builder().commitCount(1).build(), 3); + addEventAndCheckResult(testHarness, TableChange.builder().commitCount(1).build(), 3); + + addEventAndCheckResult(testHarness, TableChange.builder().commitCount(1).build(), 4); + } + } + + @Test + void testDataFileCount() throws Exception { + TriggerManager manager = + manager(tableLoader(), new TriggerEvaluator.Builder().dataFileCount(3).build()); + try (KeyedOneInputStreamOperatorTestHarness 
testHarness = + harness(manager)) { + testHarness.open(); + + addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(1).build(), 0); + + addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(2).build(), 1); + addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(3).build(), 2); + addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(5).build(), 3); + + // No trigger in this case + addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(1).build(), 3); + + addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(2).build(), 4); + } + } + + @Test + void testDataFileSizeInBytes() throws Exception { + TriggerManager manager = + manager(tableLoader(), new TriggerEvaluator.Builder().dataFileSizeInBytes(3).build()); + try (KeyedOneInputStreamOperatorTestHarness testHarness = + harness(manager)) { + testHarness.open(); + + addEventAndCheckResult(testHarness, TableChange.builder().dataFileSizeInBytes(1L).build(), 0); + addEventAndCheckResult(testHarness, TableChange.builder().dataFileSizeInBytes(2L).build(), 1); + addEventAndCheckResult(testHarness, TableChange.builder().dataFileSizeInBytes(5L).build(), 2); + + // No trigger in this case + addEventAndCheckResult(testHarness, TableChange.builder().dataFileSizeInBytes(1L).build(), 2); + + addEventAndCheckResult(testHarness, TableChange.builder().dataFileSizeInBytes(2L).build(), 3); + } + } + + @Test + void testPosDeleteFileCount() throws Exception { + TriggerManager manager = + manager(tableLoader(), new TriggerEvaluator.Builder().posDeleteFileCount(3).build()); + try (KeyedOneInputStreamOperatorTestHarness testHarness = + harness(manager)) { + testHarness.open(); + + addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(1).build(), 0); + addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(2).build(), 1); + addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(3).build(), 2); + addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(10).build(), 3); + + // No trigger in this case + addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(1).build(), 3); + addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(1).build(), 3); + + addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(1).build(), 4); + } + } + + @Test + void testPosDeleteRecordCount() throws Exception { + TriggerManager manager = + manager(tableLoader(), new TriggerEvaluator.Builder().posDeleteRecordCount(3).build()); + try (KeyedOneInputStreamOperatorTestHarness testHarness = + harness(manager)) { + testHarness.open(); + + addEventAndCheckResult( + testHarness, TableChange.builder().posDeleteRecordCount(1L).build(), 0); + addEventAndCheckResult( + testHarness, TableChange.builder().posDeleteRecordCount(2L).build(), 1); + addEventAndCheckResult( + testHarness, TableChange.builder().posDeleteRecordCount(5L).build(), 2); + + // No trigger in this case + addEventAndCheckResult( + testHarness, TableChange.builder().posDeleteRecordCount(1L).build(), 2); + + addEventAndCheckResult( + testHarness, TableChange.builder().posDeleteRecordCount(2L).build(), 3); + } + } + + @Test + void testEqDeleteFileCount() throws Exception { + TriggerManager manager = + manager(tableLoader(), new TriggerEvaluator.Builder().eqDeleteFileCount(3).build()); + try (KeyedOneInputStreamOperatorTestHarness testHarness = + harness(manager)) { + testHarness.open(); + 
+ addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(1).build(), 0); + addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(2).build(), 1); + addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(3).build(), 2); + addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(10).build(), 3); + + // No trigger in this case + addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(1).build(), 3); + addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(1).build(), 3); + + addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(1).build(), 4); + } + } + + @Test + void testEqDeleteRecordCount() throws Exception { + TriggerManager manager = + manager(tableLoader(), new TriggerEvaluator.Builder().eqDeleteRecordCount(3).build()); + try (KeyedOneInputStreamOperatorTestHarness testHarness = + harness(manager)) { + testHarness.open(); + + addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteRecordCount(1L).build(), 0); + addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteRecordCount(2L).build(), 1); + addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteRecordCount(5L).build(), 2); + + // No trigger in this case + addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteRecordCount(1L).build(), 2); + + addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteRecordCount(2L).build(), 3); + } + } + + @Test + void testTimeout() throws Exception { + TriggerManager manager = + manager( + tableLoader(), new TriggerEvaluator.Builder().timeout(Duration.ofSeconds(1)).build()); + try (KeyedOneInputStreamOperatorTestHarness testHarness = + harness(manager)) { + testHarness.open(); + + TableChange event = TableChange.builder().dataFileCount(1).commitCount(1).build(); + + // Wait for some time + testHarness.processElement(event, EVENT_TIME); + assertThat(testHarness.extractOutputValues()).isEmpty(); + + // Wait for the timeout to expire + long newTime = EVENT_TIME + Duration.ofSeconds(1).toMillis(); + testHarness.setProcessingTime(newTime); + testHarness.processElement(event, newTime); + assertThat(testHarness.extractOutputValues()).hasSize(1); + + // Remove the lock to allow the next trigger + lock.unlock(); + + // Send a new event + testHarness.setProcessingTime(newTime + 1); + testHarness.processElement(event, newTime); + + // No trigger yet + assertThat(testHarness.extractOutputValues()).hasSize(1); + + // Send a new event + newTime += Duration.ofSeconds(1).toMillis(); + testHarness.setProcessingTime(newTime); + testHarness.processElement(event, newTime); + + // New trigger should arrive + assertThat(testHarness.extractOutputValues()).hasSize(2); + } + } + + @Test + void testStateRestore() throws Exception { + TableLoader tableLoader = tableLoader(); + TriggerManager manager = manager(tableLoader); + OperatorSubtaskState state; + try (KeyedOneInputStreamOperatorTestHarness testHarness = + harness(manager)) { + testHarness.open(); + + testHarness.processElement( + TableChange.builder().dataFileCount(1).commitCount(1).build(), EVENT_TIME); + + assertThat(testHarness.extractOutputValues()).isEmpty(); + + state = testHarness.snapshot(1, EVENT_TIME); + } + + // Restore the state, write some more data, create a checkpoint, check the data which is written + manager = manager(tableLoader); + try (KeyedOneInputStreamOperatorTestHarness testHarness = + harness(manager)) { + testHarness.initializeState(state); + 
testHarness.open();
+
+      // The first real change arrives and triggers the recovery process
+      testHarness.processElement(TableChange.builder().commitCount(1).build(), EVENT_TIME_2);
+      assertTriggers(
+          testHarness.extractOutputValues(),
+          Lists.newArrayList(Trigger.recovery(testHarness.getProcessingTime())));
+
+      // Remove the lock to allow the next trigger
+      recoveringLock.unlock();
+      testHarness.setProcessingTime(EVENT_TIME_2);
+      // At this point the output contains the recovery trigger and the real trigger
+      assertThat(testHarness.extractOutputValues()).hasSize(2);
+    }
+  }
+
+  @Test
+  void testNewJobReleasesExistingLock() throws Exception {
+    // Lock first to mock previous job orphaned lock
+    lock.tryLock();
+    recoveringLock.tryLock();
+
+    TableLoader tableLoader = tableLoader();
+    TriggerManager manager = manager(tableLoader);
+    try (KeyedOneInputStreamOperatorTestHarness<Boolean, TableChange, Trigger> testHarness =
+        harness(manager)) {
+      testHarness.open();
+
+      // Check whether the new job removed the orphaned lock
+      assertThat(lock.isHeld()).isFalse();
+      assertThat(recoveringLock.isHeld()).isFalse();
+    }
+  }
+
+  @Test
+  void testMinFireDelay() throws Exception {
+    TableLoader tableLoader = tableLoader();
+    TriggerManager manager = manager(tableLoader, DELAY, 1);
+    try (KeyedOneInputStreamOperatorTestHarness<Boolean, TableChange, Trigger> testHarness =
+        harness(manager)) {
+      testHarness.open();
+
+      addEventAndCheckResult(testHarness, TableChange.builder().commitCount(2).build(), 1);
+      long currentTime = testHarness.getProcessingTime();
+
+      // No new fire yet
+      addEventAndCheckResult(testHarness, TableChange.builder().commitCount(2).build(), 1);
+
+      // Check that the trigger fired after the delay
+      testHarness.setProcessingTime(currentTime + DELAY);
+      assertThat(testHarness.extractOutputValues()).hasSize(2);
+    }
+  }
+
+  @Test
+  void testLockCheckDelay() throws Exception {
+    TableLoader tableLoader = tableLoader();
+    TriggerManager manager = manager(tableLoader, 1, DELAY);
+    try (KeyedOneInputStreamOperatorTestHarness<Boolean, TableChange, Trigger> testHarness =
+        harness(manager)) {
+      testHarness.open();
+
+      addEventAndCheckResult(testHarness, TableChange.builder().commitCount(2).build(), 1);
+
+      // Create a lock to prevent execution, and check that there is no result
+      assertThat(lock.tryLock()).isTrue();
+      addEventAndCheckResult(testHarness, TableChange.builder().commitCount(2).build(), 1);
+      long currentTime = testHarness.getProcessingTime();
+
+      // Remove the lock, and still no trigger
+      lock.unlock();
+      assertThat(testHarness.extractOutputValues()).hasSize(1);
+
+      // Check that the trigger fired after the delay
+      testHarness.setProcessingTime(currentTime + DELAY);
+      assertThat(testHarness.extractOutputValues()).hasSize(2);
+    }
+  }
+
+  /**
+   * Simulating recovery scenarios where there is a leftover table lock and an ongoing maintenance
+   * task.
+ * + * @param locked if a lock exists on the table on job recovery + * @param runningTask is running and continues to run after job recovery + */ + @ParameterizedTest + @MethodSource("parametersForTestRecovery") + void testRecovery(boolean locked, boolean runningTask) throws Exception { + TableLoader tableLoader = tableLoader(); + TriggerManager manager = manager(tableLoader); + OperatorSubtaskState state; + try (KeyedOneInputStreamOperatorTestHarness testHarness = + harness(manager)) { + testHarness.open(); + state = testHarness.snapshot(1, EVENT_TIME); + } + + if (locked) { + assertThat(lock.tryLock()).isTrue(); + } + + manager = manager(tableLoader); + List expected = Lists.newArrayListWithExpectedSize(3); + try (KeyedOneInputStreamOperatorTestHarness testHarness = + harness(manager)) { + testHarness.initializeState(state); + testHarness.open(); + + ++processingTime; + expected.add(Trigger.recovery(processingTime)); + testHarness.setProcessingTime(processingTime); + testHarness.processElement(TableChange.builder().commitCount(2).build(), processingTime); + assertTriggers(testHarness.extractOutputValues(), expected); + + // Nothing happens until the recovery is finished + ++processingTime; + testHarness.setProcessingTime(processingTime); + assertTriggers(testHarness.extractOutputValues(), expected); + + if (runningTask) { + // Simulate the action of the recovered maintenance task lock removal when it finishes + lock.unlock(); + } + + // Still no results as the recovery is ongoing + ++processingTime; + testHarness.setProcessingTime(processingTime); + testHarness.processElement(TableChange.builder().commitCount(2).build(), processingTime); + assertTriggers(testHarness.extractOutputValues(), expected); + + // Simulate the action of removing lock and recoveryLock by downstream lock cleaner when it + // received recovery trigger + lock.unlock(); + recoveringLock.unlock(); + + // Emit only a single trigger + ++processingTime; + testHarness.setProcessingTime(processingTime); + // Releasing lock will create a new snapshot, and we receive this in the trigger + expected.add(Trigger.create(processingTime, 0)); + assertTriggers(testHarness.extractOutputValues(), expected); + } + } + + @Test + void testTriggerMetrics() throws Exception { + TableLoader tableLoader = tableLoader(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + ManualSource source = + new ManualSource<>(env, TypeInformation.of(TableChange.class)); + CollectingSink sink = new CollectingSink<>(); + + TriggerManager manager = + new TriggerManager( + tableLoader, + LOCK_FACTORY, + Lists.newArrayList(TASKS), + Lists.newArrayList( + new TriggerEvaluator.Builder().commitCount(2).build(), + new TriggerEvaluator.Builder().commitCount(4).build()), + 1L, + 1L); + source + .dataStream() + .keyBy(unused -> true) + .process(manager) + .name(DUMMY_TASK_NAME) + .forceNonParallel() + .sinkTo(sink); + + JobClient jobClient = null; + try { + jobClient = env.executeAsync(); + + // This one doesn't trigger - tests NOTHING_TO_TRIGGER + source.sendRecord(TableChange.builder().commitCount(1).build()); + + Awaitility.await() + .until( + () -> { + Long notingCounter = + MetricsReporterFactoryForTests.counter( + ImmutableList.of(DUMMY_TASK_NAME, tableName, NOTHING_TO_TRIGGER)); + return notingCounter != null && notingCounter.equals(1L); + }); + + // Trigger one of the tasks - tests TRIGGERED + source.sendRecord(TableChange.builder().commitCount(1).build()); + // Wait until we receive the trigger + 
assertThat(sink.poll(Duration.ofSeconds(5))).isNotNull(); + assertThat( + MetricsReporterFactoryForTests.counter( + ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[0], "0", TRIGGERED))) + .isEqualTo(1L); + lock.unlock(); + + // Trigger both of the tasks - tests TRIGGERED + source.sendRecord(TableChange.builder().commitCount(2).build()); + // Wait until we receive the trigger + assertThat(sink.poll(Duration.ofSeconds(5))).isNotNull(); + lock.unlock(); + assertThat(sink.poll(Duration.ofSeconds(5))).isNotNull(); + lock.unlock(); + assertThat( + MetricsReporterFactoryForTests.counter( + ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[0], "0", TRIGGERED))) + .isEqualTo(2L); + assertThat( + MetricsReporterFactoryForTests.counter( + ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[1], "1", TRIGGERED))) + .isEqualTo(1L); + + // Final check all the counters + MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, RATE_LIMITER_TRIGGERED), -1L) + .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, CONCURRENT_RUN_THROTTLED), -1L) + .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[0], "0", TRIGGERED), 2L) + .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[1], "1", TRIGGERED), 1L) + .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, NOTHING_TO_TRIGGER), 1L) + .build()); + } finally { + closeJobClient(jobClient); + } + } + + @Test + void testRateLimiterMetrics() throws Exception { + TableLoader tableLoader = tableLoader(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + ManualSource source = + new ManualSource<>(env, TypeInformation.of(TableChange.class)); + CollectingSink sink = new CollectingSink<>(); + + // High delay, so only triggered once + TriggerManager manager = manager(tableLoader, 1_000_000L, 1L); + source + .dataStream() + .keyBy(unused -> true) + .process(manager) + .name(DUMMY_TASK_NAME) + .forceNonParallel() + .sinkTo(sink); + + JobClient jobClient = null; + try { + jobClient = env.executeAsync(); + + // Start the first trigger + source.sendRecord(TableChange.builder().commitCount(2).build()); + assertThat(sink.poll(Duration.ofSeconds(5))).isNotNull(); + + // Remove the lock to allow the next trigger + lock.unlock(); + + // The second trigger will be blocked + source.sendRecord(TableChange.builder().commitCount(2).build()); + Awaitility.await() + .until( + () -> + MetricsReporterFactoryForTests.counter( + ImmutableList.of(DUMMY_TASK_NAME, tableName, RATE_LIMITER_TRIGGERED)) + .equals(1L)); + + // Final check all the counters + assertCounters(1L, 0L); + } finally { + closeJobClient(jobClient); + } + } + + @Test + void testConcurrentRunMetrics() throws Exception { + TableLoader tableLoader = tableLoader(); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + ManualSource source = + new ManualSource<>(env, TypeInformation.of(TableChange.class)); + CollectingSink sink = new CollectingSink<>(); + + // High delay, so only triggered once + TriggerManager manager = manager(tableLoader, 1L, 1_000_000L); + source + .dataStream() + .keyBy(unused -> true) + .process(manager) + .name(DUMMY_TASK_NAME) + .forceNonParallel() + .sinkTo(sink); + + JobClient jobClient = null; + try { + jobClient = env.executeAsync(); + + // Start the first trigger - notice that we do not remove the lock after the trigger + source.sendRecord(TableChange.builder().commitCount(2).build()); + 
assertThat(sink.poll(Duration.ofSeconds(5))).isNotNull(); + + // The second trigger will be blocked by the lock + source.sendRecord(TableChange.builder().commitCount(2).build()); + Awaitility.await() + .until( + () -> + MetricsReporterFactoryForTests.counter( + ImmutableList.of(DUMMY_TASK_NAME, tableName, CONCURRENT_RUN_THROTTLED)) + .equals(1L)); + + // Final check all the counters + assertCounters(0L, 1L); + } finally { + closeJobClient(jobClient); + } + } + + private static Stream parametersForTestRecovery() { + return Stream.of( + Arguments.of(true, true), + Arguments.of(true, false), + Arguments.of(false, true), + Arguments.of(false, false)); + } + + private void assertCounters(long rateLimiterTrigger, long concurrentRunTrigger) { + MetricsReporterFactoryForTests.assertCounters( + new ImmutableMap.Builder, Long>() + .put( + ImmutableList.of(DUMMY_TASK_NAME, tableName, RATE_LIMITER_TRIGGERED), + rateLimiterTrigger) + .put( + ImmutableList.of(DUMMY_TASK_NAME, tableName, CONCURRENT_RUN_THROTTLED), + concurrentRunTrigger) + .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[0], "0", TRIGGERED), 1L) + .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, NOTHING_TO_TRIGGER), 0L) + .build()); + } + + private KeyedOneInputStreamOperatorTestHarness harness( + TriggerManager manager) throws Exception { + return new KeyedOneInputStreamOperatorTestHarness<>( + new KeyedProcessOperator<>(manager), value -> true, Types.BOOLEAN); + } + + private void addEventAndCheckResult( + OneInputStreamOperatorTestHarness testHarness, + TableChange event, + int expectedSize) + throws Exception { + ++processingTime; + testHarness.setProcessingTime(processingTime); + testHarness.processElement(event, processingTime); + assertThat(testHarness.extractOutputValues()).hasSize(expectedSize); + // Remove the lock to allow the next trigger + lock.unlock(); + } + + private TriggerManager manager(TableLoader tableLoader, TriggerEvaluator evaluator) { + return new TriggerManager( + tableLoader, + LOCK_FACTORY, + Lists.newArrayList(TASKS[0]), + Lists.newArrayList(evaluator), + 1, + 1); + } + + private TriggerManager manager( + TableLoader tableLoader, long minFireDelayMs, long lockCheckDelayMs) { + return new TriggerManager( + tableLoader, + LOCK_FACTORY, + Lists.newArrayList(TASKS[0]), + Lists.newArrayList(new TriggerEvaluator.Builder().commitCount(2).build()), + minFireDelayMs, + lockCheckDelayMs); + } + + private TriggerManager manager(TableLoader tableLoader) { + return manager(tableLoader, new TriggerEvaluator.Builder().commitCount(2).build()); + } + + private static void assertTriggers(List expected, List actual) { + assertThat(actual).hasSize(expected.size()); + for (int i = 0; i < expected.size(); ++i) { + Trigger expectedTrigger = expected.get(i); + Trigger actualTrigger = actual.get(i); + assertThat(actualTrigger.timestamp()).isEqualTo(expectedTrigger.timestamp()); + assertThat(actualTrigger.taskId()).isEqualTo(expectedTrigger.taskId()); + assertThat(actualTrigger.isRecovery()).isEqualTo(expectedTrigger.isRecovery()); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java new file mode 100644 index 000000000000..1cf55bcdc817 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; +import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; +import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; +import org.apache.flink.streaming.runtime.streamrecord.StreamElement; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; + +class SinkTestUtil { + + private SinkTestUtil() {} + + @SuppressWarnings("unchecked") + static List transformsToStreamElement(Collection elements) { + return elements.stream() + .map( + element -> { + if (element instanceof StreamRecord) { + return new StreamRecord<>( + ((StreamRecord>) element).getValue()); + } + return (StreamElement) element; + }) + .collect(Collectors.toList()); + } + + static CommittableSummary extractAndAssertCommittableSummary(StreamElement element) { + final Object value = element.asRecord().getValue(); + assertThat(value).isInstanceOf(CommittableSummary.class); + return (CommittableSummary) value; + } + + static CommittableWithLineage extractAndAssertCommittableWithLineage( + StreamElement element) { + final Object value = element.asRecord().getValue(); + assertThat(value).isInstanceOf(CommittableWithLineage.class); + return (CommittableWithLineage) value; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java new file mode 100644 index 000000000000..44eb907a17aa --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.flink.AvroGenericRecordConverterBase; +import org.apache.iceberg.flink.DataGenerator; + +public class TestAvroGenericRecordToRowDataMapper extends AvroGenericRecordConverterBase { + @Override + protected void testConverter(DataGenerator dataGenerator) throws Exception { + // Need to use avroSchema from DataGenerator because some primitive types have special Avro + // type handling. Hence the Avro schema converted from Iceberg schema won't work. + AvroGenericRecordToRowDataMapper mapper = + AvroGenericRecordToRowDataMapper.forAvroSchema(dataGenerator.avroSchema()); + RowData expected = dataGenerator.generateFlinkRowData(); + RowData actual = mapper.map(dataGenerator.generateAvroGenericRecord()); + assertThat(actual).isEqualTo(expected); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java new file mode 100644 index 000000000000..abac605f81fd --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestBucketPartitionKeySelector { + + @ParameterizedTest + @EnumSource( + value = TableSchemaType.class, + names = {"ONE_BUCKET", "IDENTITY_AND_BUCKET"}) + public void testCorrectKeySelection(TableSchemaType tableSchemaType) { + int numBuckets = 60; + + PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); + BucketPartitionKeySelector keySelector = + new BucketPartitionKeySelector( + partitionSpec, SimpleDataUtil.SCHEMA, SimpleDataUtil.ROW_TYPE); + + TestBucketPartitionerUtil.generateRowsForBucketIdRange(2, numBuckets) + .forEach( + rowData -> { + int expectedBucketId = + TestBucketPartitionerUtil.computeBucketId( + numBuckets, rowData.getString(1).toString()); + Integer key = keySelector.getKey(rowData); + assertThat(key).isEqualTo(expectedBucketId); + }); + } + + @Test + public void testKeySelectorMultipleBucketsFail() { + PartitionSpec partitionSpec = TableSchemaType.TWO_BUCKETS.getPartitionSpec(1); + + assertThatExceptionOfType(RuntimeException.class) + .isThrownBy( + () -> + new BucketPartitionKeySelector( + partitionSpec, SimpleDataUtil.SCHEMA, SimpleDataUtil.ROW_TYPE)) + .withMessage(BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, 2); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java new file mode 100644 index 000000000000..59bdba578ebb --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE; +import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE; +import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_NULL_MESSAGE; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +public class TestBucketPartitioner { + + static final int DEFAULT_NUM_BUCKETS = 60; + + @ParameterizedTest + @CsvSource({"ONE_BUCKET,50", "IDENTITY_AND_BUCKET,50", "ONE_BUCKET,60", "IDENTITY_AND_BUCKET,60"}) + public void testPartitioningParallelismGreaterThanBuckets( + String schemaTypeStr, String numBucketsStr) { + int numPartitions = 500; + TableSchemaType tableSchemaType = TableSchemaType.valueOf(schemaTypeStr); + int numBuckets = Integer.parseInt(numBucketsStr); + PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); + BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); + + int bucketId = 0; + for (int expectedIndex = 0; expectedIndex < numPartitions; expectedIndex++) { + int actualPartitionIndex = bucketPartitioner.partition(bucketId, numPartitions); + assertThat(actualPartitionIndex).isEqualTo(expectedIndex); + bucketId++; + if (bucketId == numBuckets) { + bucketId = 0; + } + } + } + + @ParameterizedTest + @CsvSource({"ONE_BUCKET,50", "IDENTITY_AND_BUCKET,50", "ONE_BUCKET,60", "IDENTITY_AND_BUCKET,60"}) + public void testPartitioningParallelismEqualLessThanBuckets( + String schemaTypeStr, String numBucketsStr) { + int numPartitions = 30; + TableSchemaType tableSchemaType = TableSchemaType.valueOf(schemaTypeStr); + int numBuckets = Integer.parseInt(numBucketsStr); + PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); + BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); + + for (int bucketId = 0; bucketId < numBuckets; bucketId++) { + int actualPartitionIndex = bucketPartitioner.partition(bucketId, numPartitions); + assertThat(actualPartitionIndex).isEqualTo(bucketId % numPartitions); + } + } + + @Test + public void testPartitionerBucketIdNullFail() { + PartitionSpec partitionSpec = TableSchemaType.ONE_BUCKET.getPartitionSpec(DEFAULT_NUM_BUCKETS); + BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); + + assertThatExceptionOfType(RuntimeException.class) + .isThrownBy(() -> bucketPartitioner.partition(null, DEFAULT_NUM_BUCKETS)) + .withMessage(BUCKET_NULL_MESSAGE); + } + + @Test + public void testPartitionerMultipleBucketsFail() { + PartitionSpec partitionSpec = TableSchemaType.TWO_BUCKETS.getPartitionSpec(DEFAULT_NUM_BUCKETS); + + assertThatExceptionOfType(RuntimeException.class) + .isThrownBy(() -> new BucketPartitioner(partitionSpec)) + .withMessage(BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, 2); + } + + @Test + public void testPartitionerBucketIdOutOfRangeFail() { + PartitionSpec partitionSpec = TableSchemaType.ONE_BUCKET.getPartitionSpec(DEFAULT_NUM_BUCKETS); + BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); + + int negativeBucketId = -1; + assertThatExceptionOfType(IllegalArgumentException.class) 
+ .isThrownBy(() -> bucketPartitioner.partition(negativeBucketId, 1)) + .withMessage(BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE, negativeBucketId); + + int tooBigBucketId = DEFAULT_NUM_BUCKETS; + assertThatExceptionOfType(IllegalArgumentException.class) + .isThrownBy(() -> bucketPartitioner.partition(tooBigBucketId, 1)) + .withMessage(BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE, tooBigBucketId, DEFAULT_NUM_BUCKETS); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java new file mode 100644 index 000000000000..caf0ac6f21d8 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.apache.iceberg.flink.TestFixtures.TABLE_IDENTIFIER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; +import org.apache.flink.table.types.DataType; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import 
org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestBucketPartitionerFlinkIcebergSink { + + private static final int NUMBER_TASK_MANAGERS = 1; + private static final int SLOTS_PER_TASK_MANAGER = 8; + + @RegisterExtension + private static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(NUMBER_TASK_MANAGERS) + .setNumberSlotsPerTaskManager(SLOTS_PER_TASK_MANAGER) + .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) + .build()); + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + private static final TypeInformation ROW_TYPE_INFO = + new RowTypeInfo( + SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().stream() + .map(ExternalTypeInfo::of) + .toArray(TypeInformation[]::new)); + + // Parallelism = 8 (parallelism > numBuckets) throughout the test suite + private final int parallelism = NUMBER_TASK_MANAGERS * SLOTS_PER_TASK_MANAGER; + private final FileFormat format = FileFormat.PARQUET; + private final int numBuckets = 4; + + private Table table; + private StreamExecutionEnvironment env; + private TableLoader tableLoader; + + private void setupEnvironment(TableSchemaType tableSchemaType) { + PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitionSpec, + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + env = + StreamExecutionEnvironment.getExecutionEnvironment(DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism * 2); + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + private void appendRowsToTable(List allRows) throws Exception { + DataFormatConverters.RowConverter converter = + new DataFormatConverters.RowConverter( + SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().toArray(DataType[]::new)); + + DataStream dataStream = + env.addSource( + new BoundedTestSource<>( + allRows.stream().map(converter::toExternal).toArray(Row[]::new)), + ROW_TYPE_INFO) + .map(converter::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)) + .partitionCustom( + new BucketPartitioner(table.spec()), + new BucketPartitionKeySelector( + table.spec(), + table.schema(), + FlinkSink.toFlinkRowType(table.schema(), SimpleDataUtil.FLINK_SCHEMA))); + + FlinkSink.forRowData(dataStream) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.NONE) + .append(); + + env.execute("Test Iceberg DataStream"); + + SimpleDataUtil.assertTableRows(table, allRows); + } + + @ParameterizedTest + @EnumSource( + value = TableSchemaType.class, + names = {"ONE_BUCKET", "IDENTITY_AND_BUCKET"}) + public void testSendRecordsToAllBucketsEvenly(TableSchemaType tableSchemaType) throws Exception { + setupEnvironment(tableSchemaType); + List rows = generateTestDataRows(); + + appendRowsToTable(rows); + TableTestStats stats = extractPartitionResults(tableSchemaType); + + assertThat(stats.totalRowCount).isEqualTo(rows.size()); + // All 4 buckets should've been written to + 
assertThat(stats.writersPerBucket).hasSize(numBuckets); + assertThat(stats.numFilesPerBucket).hasSize(numBuckets); + // Writer expectation (2 writers per bucket): + // - Bucket0 -> Writers [0, 4] + // - Bucket1 -> Writers [1, 5] + // - Bucket2 -> Writers [2, 6] + // - Bucket3 -> Writers [3, 7] + for (int i = 0, j = numBuckets; i < numBuckets; i++, j++) { + assertThat(stats.writersPerBucket.get(i)).hasSameElementsAs(Arrays.asList(i, j)); + // 2 files per bucket (one file is created by each writer) + assertThat(stats.numFilesPerBucket.get(i)).isEqualTo(2); + // 2 rows per file (total of 16 rows across 8 files) + assertThat(stats.rowsPerWriter.get(i)).isEqualTo(2); + } + } + + /** + * Generating 16 rows to be sent uniformly to all writers (round-robin across 8 writers -> 4 + * buckets) + */ + private List generateTestDataRows() { + int totalNumRows = parallelism * 2; + int numRowsPerBucket = totalNumRows / numBuckets; + return TestBucketPartitionerUtil.generateRowsForBucketIdRange(numRowsPerBucket, numBuckets); + } + + private TableTestStats extractPartitionResults(TableSchemaType tableSchemaType) + throws IOException { + int totalRecordCount = 0; + Map> writersPerBucket = Maps.newHashMap(); // > + Map filesPerBucket = Maps.newHashMap(); // + Map rowsPerWriter = Maps.newHashMap(); // + + try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { + for (FileScanTask scanTask : fileScanTasks) { + long recordCountInFile = scanTask.file().recordCount(); + + String[] splitFilePath = scanTask.file().location().split("/"); + // Filename example: 00007-0-a7d3a29a-33e9-4740-88f4-0f494397d60c-00001.parquet + // Writer ID: .......^^^^^ + String filename = splitFilePath[splitFilePath.length - 1]; + int writerId = Integer.parseInt(filename.split("-")[0]); + + totalRecordCount += recordCountInFile; + int bucketId = + scanTask + .file() + .partition() + .get(tableSchemaType.bucketPartitionColumnPosition(), Integer.class); + writersPerBucket.computeIfAbsent(bucketId, k -> Lists.newArrayList()); + writersPerBucket.get(bucketId).add(writerId); + filesPerBucket.put(bucketId, filesPerBucket.getOrDefault(bucketId, 0) + 1); + rowsPerWriter.put(writerId, rowsPerWriter.getOrDefault(writerId, 0L) + recordCountInFile); + } + } + + return new TableTestStats(totalRecordCount, writersPerBucket, filesPerBucket, rowsPerWriter); + } + + /** DTO to hold Test Stats */ + private static class TableTestStats { + final int totalRowCount; + final Map> writersPerBucket; + final Map numFilesPerBucket; + final Map rowsPerWriter; + + TableTestStats( + int totalRecordCount, + Map> writersPerBucket, + Map numFilesPerBucket, + Map rowsPerWriter) { + this.totalRowCount = totalRecordCount; + this.writersPerBucket = writersPerBucket; + this.numFilesPerBucket = numFilesPerBucket; + this.rowsPerWriter = rowsPerWriter; + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java new file mode 100644 index 000000000000..e1309bfac6d5 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import java.util.UUID; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.BucketUtil; + +final class TestBucketPartitionerUtil { + + enum TableSchemaType { + ONE_BUCKET { + @Override + public int bucketPartitionColumnPosition() { + return 0; + } + + @Override + public PartitionSpec getPartitionSpec(int numBuckets) { + return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("data", numBuckets).build(); + } + }, + IDENTITY_AND_BUCKET { + @Override + public int bucketPartitionColumnPosition() { + return 1; + } + + @Override + public PartitionSpec getPartitionSpec(int numBuckets) { + return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA) + .identity("id") + .bucket("data", numBuckets) + .build(); + } + }, + TWO_BUCKETS { + @Override + public int bucketPartitionColumnPosition() { + return 1; + } + + @Override + public PartitionSpec getPartitionSpec(int numBuckets) { + return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA) + .bucket("id", numBuckets) + .bucket("data", numBuckets) + .build(); + } + }; + + public abstract int bucketPartitionColumnPosition(); + + public abstract PartitionSpec getPartitionSpec(int numBuckets); + } + + private TestBucketPartitionerUtil() {} + + /** + * Utility method to generate rows whose values will "hash" to a range of bucketIds (from 0 to + * numBuckets - 1) + * + * @param numRowsPerBucket how many different rows should be generated per bucket + * @param numBuckets max number of buckets to consider + * @return the list of rows whose data "hashes" to the desired bucketId + */ + static List generateRowsForBucketIdRange(int numRowsPerBucket, int numBuckets) { + List rows = Lists.newArrayListWithCapacity(numBuckets * numRowsPerBucket); + // For some of our tests, this order of the generated rows matters + for (int i = 0; i < numRowsPerBucket; i++) { + for (int bucketId = 0; bucketId < numBuckets; bucketId++) { + String value = generateValueForBucketId(bucketId, numBuckets); + rows.add(GenericRowData.of(1, StringData.fromString(value))); + } + } + return rows; + } + + /** + * Utility method to generate a UUID string that will "hash" to a desired bucketId + * + * @param bucketId the desired bucketId + * @return the string data that "hashes" to the desired bucketId + */ + private static String generateValueForBucketId(int bucketId, int numBuckets) { + while (true) { + String uuid = UUID.randomUUID().toString(); + if (computeBucketId(numBuckets, uuid) == bucketId) { + return uuid; + } + } + } + + /** + * Utility that performs the same hashing/bucketing mechanism used by Bucket.java + * + * @param numBuckets max number of buckets to 
consider + * @param value the string to compute the bucketId from + * @return the computed bucketId + */ + static int computeBucketId(int numBuckets, String value) { + return (BucketUtil.hash(value) & Integer.MAX_VALUE) % numBuckets; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java new file mode 100644 index 000000000000..360db658cd2f --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.time.Duration; +import java.util.concurrent.TimeUnit; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; + +public class TestCachingTableSupplier { + + @Test + public void testCheckArguments() { + SerializableTable initialTable = mock(SerializableTable.class); + + Table loadedTable = mock(Table.class); + TableLoader tableLoader = mock(TableLoader.class); + when(tableLoader.loadTable()).thenReturn(loadedTable); + + new CachingTableSupplier(initialTable, tableLoader, Duration.ofMillis(100)); + + assertThatThrownBy(() -> new CachingTableSupplier(initialTable, tableLoader, null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("tableRefreshInterval cannot be null"); + assertThatThrownBy(() -> new CachingTableSupplier(null, tableLoader, Duration.ofMillis(100))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("initialTable cannot be null"); + assertThatThrownBy(() -> new CachingTableSupplier(initialTable, null, Duration.ofMillis(100))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("tableLoader cannot be null"); + } + + @Test + public void testTableReload() { + SerializableTable initialTable = mock(SerializableTable.class); + + Table loadedTable = mock(Table.class); + TableLoader tableLoader = mock(TableLoader.class); + when(tableLoader.loadTable()).thenReturn(loadedTable); + + CachingTableSupplier cachingTableSupplier = + new CachingTableSupplier(initialTable, tableLoader, Duration.ofMillis(100)); + + // refresh shouldn't do anything as the min reload interval hasn't passed + cachingTableSupplier.refreshTable(); + assertThat(cachingTableSupplier.get()).isEqualTo(initialTable); + + // refresh after waiting past the min reload interval + 
Awaitility.await() + .atLeast(100, TimeUnit.MILLISECONDS) + .untilAsserted( + () -> { + cachingTableSupplier.refreshTable(); + assertThat(cachingTableSupplier.get()).isEqualTo(loadedTable); + }); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java new file mode 100644 index 000000000000..0c7a47c23230 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java @@ -0,0 +1,319 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; +import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileMetadata; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.maintenance.operator.TableChange; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +class TestCommittableToTableChangeConverter { + @TempDir private File tempDir; + private Table table; + private FileIO fileIO; + private String tableName; + private Map specs; + private DataFile dataFile; + private DataFile dataFile1; + private DeleteFile posDeleteFile; + private DeleteFile eqDeleteFile; + + @BeforeEach + public void before() throws Exception { + String warehouse = tempDir.getAbsolutePath(); + + String tablePath = warehouse.concat("/test"); + assertThat(new File(tablePath).mkdir()).as("Should create the table path correctly.").isTrue(); + table = SimpleDataUtil.createTable(tablePath, Maps.newHashMap(), false); + fileIO = table.io(); + tableName = table.name(); + specs = table.specs(); + dataFile = + 
DataFiles.builder(table.spec()) + .withPath("/path/to/data.parquet") + .withFileSizeInBytes(100) + .withRecordCount(10) + .build(); + dataFile1 = + DataFiles.builder(table.spec()) + .withPath("/path/to/data1.parquet") + .withFileSizeInBytes(101) + .withRecordCount(11) + .build(); + posDeleteFile = + FileMetadata.deleteFileBuilder(table.spec()) + .ofPositionDeletes() + .withPath("/path/to/pos-deletes.parquet") + .withFileSizeInBytes(50) + .withRecordCount(5) + .build(); + eqDeleteFile = + FileMetadata.deleteFileBuilder(table.spec()) + .ofEqualityDeletes(1) + .withPath("/path/to/eq-deletes.parquet") + .withFileSizeInBytes(30) + .withRecordCount(3) + .build(); + } + + @Test + public void testConvertWriteResultToTableChange() throws Exception { + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, 1, 1); + + try (OneInputStreamOperatorTestHarness, TableChange> + harness = + ProcessFunctionTestHarnesses.forProcessFunction( + new CommittableToTableChangeConverter(fileIO, tableName, specs))) { + harness.open(); + WriteResult writeResult = + WriteResult.builder() + .addDataFiles(dataFile) + .addDeleteFiles(posDeleteFile, eqDeleteFile) + .build(); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles(writeResult, () -> factory.create(1), table.spec()); + IcebergCommittable committable = + new IcebergCommittable( + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, deltaManifests), + flinkJobId, + operatorId, + 1L); + CommittableWithLineage message = + new CommittableWithLineage<>(committable, 1L, 0); + harness.processElement(new StreamRecord<>(message)); + TableChange tableChange = harness.extractOutputValues().get(0); + TableChange expectedTableChange = + TableChange.builder() + .dataFileCount(1) + .dataFileSizeInBytes(100) + .posDeleteFileCount(1) + .posDeleteRecordCount(5) + .eqDeleteFileCount(1) + .eqDeleteRecordCount(3) + .commitCount(1) + .build(); + + assertThat(tableChange).isEqualTo(expectedTableChange); + } + } + + @Test + public void testConvertReplays() throws Exception { + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, 1, 1); + + try (OneInputStreamOperatorTestHarness, TableChange> + harness = + ProcessFunctionTestHarnesses.forProcessFunction( + new CommittableToTableChangeConverter(fileIO, tableName, specs))) { + harness.open(); + + Tuple2, DeltaManifests> icebergCommittable = + createIcebergCommittable( + dataFile, posDeleteFile, eqDeleteFile, factory, table, flinkJobId, operatorId, 1L); + harness.processElement(new StreamRecord<>(icebergCommittable.f0)); + // Duplicate data should be handled properly to avoid job failure. 
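+ // A replay like this can happen when Flink restores from a checkpoint and redelivers the committable; + // the converter is expected to de-duplicate it, so exactly one TableChange is asserted below.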
+ harness.processElement(new StreamRecord<>(icebergCommittable.f0)); + List<TableChange> tableChanges = harness.extractOutputValues(); + assertThat(tableChanges).hasSize(1); + TableChange tableChange = tableChanges.get(0); + TableChange expectedTableChange = + TableChange.builder() + .dataFileCount(1) + .dataFileSizeInBytes(100) + .posDeleteFileCount(1) + .posDeleteRecordCount(5) + .eqDeleteFileCount(1) + .eqDeleteRecordCount(3) + .commitCount(1) + .build(); + + assertThat(tableChange).isEqualTo(expectedTableChange); + } + } + + @Test + public void testReadUnExistManifest() throws Exception { + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, 1, 1); + + try (OneInputStreamOperatorTestHarness<CommittableMessage<IcebergCommittable>, TableChange> + harness = + ProcessFunctionTestHarnesses.forProcessFunction( + new CommittableToTableChangeConverter(fileIO, tableName, specs))) { + harness.open(); + + Tuple2<CommittableMessage<IcebergCommittable>, DeltaManifests> icebergCommittable = + createIcebergCommittable( + dataFile, posDeleteFile, eqDeleteFile, factory, table, flinkJobId, operatorId, 1L); + + for (ManifestFile manifest : icebergCommittable.f1.manifests()) { + fileIO.deleteFile(manifest.path()); + // Verify that the manifest file was deleted + assertThat(new File(manifest.path())).doesNotExist(); + } + + // Emit the same committable to verify that reading a missing manifest + // is handled gracefully and does not fail the job. + harness.processElement(new StreamRecord<>(icebergCommittable.f0)); + + Tuple2<CommittableMessage<IcebergCommittable>, DeltaManifests> icebergCommittable1 = + createIcebergCommittable( + dataFile1, posDeleteFile, eqDeleteFile, factory, table, flinkJobId, operatorId, 1L); + + harness.processElement(new StreamRecord<>(icebergCommittable1.f0)); + + List<TableChange> tableChanges = harness.extractOutputValues(); + assertThat(tableChanges).hasSize(1); + TableChange tableChange = tableChanges.get(0); + TableChange expectedTableChange = + TableChange.builder() + .dataFileCount(1) + .dataFileSizeInBytes(101) + .posDeleteFileCount(1) + .posDeleteRecordCount(5) + .eqDeleteFileCount(1) + .eqDeleteRecordCount(3) + .commitCount(1) + .build(); + + assertThat(tableChange).isEqualTo(expectedTableChange); + } + } + + @Test + public void testEmptyCommit() throws Exception { + try (OneInputStreamOperatorTestHarness<CommittableMessage<IcebergCommittable>, TableChange> + harness = + ProcessFunctionTestHarnesses.forProcessFunction( + new CommittableToTableChangeConverter(fileIO, tableName, specs))) { + + harness.open(); + IcebergCommittable emptyCommittable = + new IcebergCommittable(new byte[0], "jobId", "operatorId", 1L); + CommittableWithLineage<IcebergCommittable> message = + new CommittableWithLineage<>(emptyCommittable, 1L, 0); + harness.processElement(new StreamRecord<>(message)); + List<TableChange> tableChanges = harness.extractOutputValues(); + assertThat(tableChanges).hasSize(0); + } + } + + @Test + public void testManifestDeletion() throws Exception { + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, 1, 1); + + try (OneInputStreamOperatorTestHarness<CommittableMessage<IcebergCommittable>, TableChange> + harness = + ProcessFunctionTestHarnesses.forProcessFunction( + new CommittableToTableChangeConverter(fileIO, tableName, specs))) { + + harness.open(); + + Tuple2<CommittableMessage<IcebergCommittable>, DeltaManifests> icebergCommittable = + createIcebergCommittable( + dataFile, posDeleteFile, eqDeleteFile, factory, table, flinkJobId, 
operatorId, 1L); + + harness.processElement(new StreamRecord<>(icebergCommittable.f0)); + + // check Manifest files are deleted + for (ManifestFile manifest : icebergCommittable.f1.manifests()) { + assertThat(new File(manifest.path())).doesNotExist(); + } + } + } + + private static Tuple2, DeltaManifests> + createIcebergCommittable( + DataFile dataFile, + DeleteFile posDeleteFile, + DeleteFile eqDeleteFile, + ManifestOutputFileFactory factory, + Table table, + String flinkJobId, + String operatorId, + long checkpointId) + throws IOException { + WriteResult writeResult = + WriteResult.builder() + .addDataFiles(dataFile) + .addDeleteFiles(posDeleteFile, eqDeleteFile) + .build(); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + writeResult, () -> factory.create(checkpointId), table.spec()); + + IcebergCommittable committable = + new IcebergCommittable( + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, deltaManifests), + flinkJobId, + operatorId, + checkpointId); + return Tuple2.of(new CommittableWithLineage<>(committable, checkpointId, 0), deltaManifests); + } + + private static String newFlinkJobId() { + return UUID.randomUUID().toString(); + } + + private static String newOperatorUniqueId() { + return UUID.randomUUID().toString(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java new file mode 100644 index 000000000000..5a74db5713a5 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.common.DynFields; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.BaseTaskWriter; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestCompressionSettings { + @TempDir protected Path temporaryFolder; + + private Table table; + + @Parameter(index = 0) + private Map initProperties; + + @Parameters(name = "tableProperties = {0}") + public static Object[][] parameters() { + return new Object[][] { + new Object[] {ImmutableMap.of()}, + new Object[] { + ImmutableMap.of( + TableProperties.AVRO_COMPRESSION, + "zstd", + TableProperties.AVRO_COMPRESSION_LEVEL, + "3", + TableProperties.PARQUET_COMPRESSION, + "zstd", + TableProperties.PARQUET_COMPRESSION_LEVEL, + "3", + TableProperties.ORC_COMPRESSION, + "zstd", + TableProperties.ORC_COMPRESSION_STRATEGY, + "compression") + } + }; + } + + @BeforeEach + public void before() throws IOException { + File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); + table = SimpleDataUtil.createTable(folder.getAbsolutePath(), initProperties, false); + } + + @TestTemplate + public void testCompressionAvro() throws Exception { + // No override provided + Map resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "AVRO")); + + if (initProperties.get(TableProperties.AVRO_COMPRESSION) == null) { + assertThat(resultProperties) + .containsEntry(TableProperties.AVRO_COMPRESSION, TableProperties.AVRO_COMPRESSION_DEFAULT) + .doesNotContainKey(TableProperties.AVRO_COMPRESSION_LEVEL); + } else { + assertThat(resultProperties) + .containsEntry( + TableProperties.AVRO_COMPRESSION, + initProperties.get(TableProperties.AVRO_COMPRESSION)) + .containsEntry( + TableProperties.AVRO_COMPRESSION_LEVEL, + initProperties.get(TableProperties.AVRO_COMPRESSION_LEVEL)); + } + + // Override compression to snappy and some random level + resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of( + FlinkWriteOptions.WRITE_FORMAT.key(), + "AVRO", + FlinkWriteOptions.COMPRESSION_CODEC.key(), + "snappy", + FlinkWriteOptions.COMPRESSION_LEVEL.key(), + "6")); + + assertThat(resultProperties) + .containsEntry(TableProperties.AVRO_COMPRESSION, "snappy") + .containsEntry(TableProperties.AVRO_COMPRESSION_LEVEL, "6"); + } + + @TestTemplate + public void testCompressionParquet() throws Exception { + // No override provided + Map 
resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "PARQUET")); + + if (initProperties.get(TableProperties.PARQUET_COMPRESSION) == null) { + assertThat(resultProperties) + .containsEntry( + TableProperties.PARQUET_COMPRESSION, + TableProperties.PARQUET_COMPRESSION_DEFAULT_SINCE_1_4_0) + .doesNotContainKey(TableProperties.PARQUET_COMPRESSION_LEVEL); + } else { + assertThat(resultProperties) + .containsEntry( + TableProperties.PARQUET_COMPRESSION, + initProperties.get(TableProperties.PARQUET_COMPRESSION)) + .containsEntry( + TableProperties.PARQUET_COMPRESSION_LEVEL, + initProperties.get(TableProperties.PARQUET_COMPRESSION_LEVEL)); + } + + // Override compression to snappy and some random level + resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of( + FlinkWriteOptions.WRITE_FORMAT.key(), + "PARQUET", + FlinkWriteOptions.COMPRESSION_CODEC.key(), + "snappy", + FlinkWriteOptions.COMPRESSION_LEVEL.key(), + "6")); + + assertThat(resultProperties) + .containsEntry(TableProperties.PARQUET_COMPRESSION, "snappy") + .containsEntry(TableProperties.PARQUET_COMPRESSION_LEVEL, "6"); + } + + @TestTemplate + public void testCompressionOrc() throws Exception { + // No override provided + Map resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "ORC")); + + if (initProperties.get(TableProperties.ORC_COMPRESSION) == null) { + assertThat(resultProperties) + .containsEntry(TableProperties.ORC_COMPRESSION, TableProperties.ORC_COMPRESSION_DEFAULT) + .containsEntry( + TableProperties.ORC_COMPRESSION_STRATEGY, + TableProperties.ORC_COMPRESSION_STRATEGY_DEFAULT); + } else { + assertThat(resultProperties) + .containsEntry( + TableProperties.ORC_COMPRESSION, initProperties.get(TableProperties.ORC_COMPRESSION)) + .containsEntry( + TableProperties.ORC_COMPRESSION_STRATEGY, + initProperties.get(TableProperties.ORC_COMPRESSION_STRATEGY)); + } + + // Override compression to snappy and a different strategy + resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of( + FlinkWriteOptions.WRITE_FORMAT.key(), + "ORC", + FlinkWriteOptions.COMPRESSION_CODEC.key(), + "snappy", + FlinkWriteOptions.COMPRESSION_STRATEGY.key(), + "speed")); + + assertThat(resultProperties) + .containsEntry(TableProperties.ORC_COMPRESSION, "snappy") + .containsEntry(TableProperties.ORC_COMPRESSION_STRATEGY, "speed"); + } + + private static OneInputStreamOperatorTestHarness + createIcebergStreamWriter( + Table icebergTable, ResolvedSchema flinkSchema, Map override) + throws Exception { + RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); + FlinkWriteConf flinkWriteConfig = + new FlinkWriteConf( + icebergTable, override, new org.apache.flink.configuration.Configuration()); + + IcebergStreamWriter streamWriter = + FlinkSink.createStreamWriter(() -> icebergTable, flinkWriteConfig, flinkRowType, null); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); + + harness.setup(); + harness.open(); + + return harness; + } + + private static Map appenderProperties( + Table table, ResolvedSchema schema, Map override) throws Exception { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter(table, schema, override)) { + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); + + 
testHarness.prepareSnapshotPreBarrier(1L); + DynFields.BoundField operatorField = + DynFields.builder() + .hiddenImpl(testHarness.getOperatorFactory().getClass(), "operator") + .build(testHarness.getOperatorFactory()); + DynFields.BoundField writerField = + DynFields.builder() + .hiddenImpl(IcebergStreamWriter.class, "writer") + .build(operatorField.get()); + DynFields.BoundField appenderField = + DynFields.builder() + .hiddenImpl(BaseTaskWriter.class, "appenderFactory") + .build(writerField.get()); + DynFields.BoundField> propsField = + DynFields.builder() + .hiddenImpl(FlinkAppenderFactory.class, "props") + .build(appenderField.get()); + return propsField.get(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java new file mode 100644 index 000000000000..a21c51c378af --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java @@ -0,0 +1,428 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.SimpleDataUtil.createDelete; +import static org.apache.iceberg.flink.SimpleDataUtil.createInsert; +import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; +import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateAfter; +import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateBefore; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.OffsetDateTime; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.RowDelta; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestBase; +import org.apache.iceberg.TestTables; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.StructLikeSet; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestDeltaTaskWriter extends TestBase { + + @Parameter(index = 1) + private FileFormat format; + + @Parameters(name = "formatVersion = {0}, fileFormat = {1}") + protected static List parameters() { + return Arrays.asList( + new Object[] {2, FileFormat.AVRO}, + new Object[] {2, FileFormat.ORC}, + new Object[] {2, FileFormat.PARQUET}); + } + + @Override + @BeforeEach + public void setupTable() throws IOException { + this.metadataDir = new File(tableDir, "metadata"); + } + + private int idFieldId() { + return table.schema().findField("id").fieldId(); + } + + private int dataFieldId() { + return table.schema().findField("data").fieldId(); + } + + private void testCdcEvents(boolean partitioned) throws IOException { + Set equalityFieldIds = Sets.newHashSet(idFieldId()); + TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); + taskWriterFactory.initialize(1, 1); + + // Start the 1th 
transaction. + TaskWriter writer = taskWriterFactory.create(); + + writer.write(createInsert(1, "aaa")); + writer.write(createInsert(2, "bbb")); + writer.write(createInsert(3, "ccc")); + + // Update <2, 'bbb'> to <2, 'ddd'> + writer.write(createUpdateBefore(2, "bbb")); // 1 pos-delete and 1 eq-delete. + writer.write(createUpdateAfter(2, "ddd")); + + // Update <1, 'aaa'> to <1, 'eee'> + writer.write(createUpdateBefore(1, "aaa")); // 1 pos-delete and 1 eq-delete. + writer.write(createUpdateAfter(1, "eee")); + + // Insert <4, 'fff'> + writer.write(createInsert(4, "fff")); + // Insert <5, 'ggg'> + writer.write(createInsert(5, "ggg")); + + // Delete <3, 'ccc'> + writer.write(createDelete(3, "ccc")); // 1 pos-delete and 1 eq-delete. + + WriteResult result = writer.complete(); + assertThat(result.dataFiles()).hasSize(partitioned ? 7 : 1); + assertThat(result.deleteFiles()).hasSize(partitioned ? 3 : 1); + commitTransaction(result); + + assertThat(actualRowSet("*")) + .isEqualTo( + expectedRowSet( + createRecord(1, "eee"), + createRecord(2, "ddd"), + createRecord(4, "fff"), + createRecord(5, "ggg"))); + + // Start the 2nd transaction. + writer = taskWriterFactory.create(); + + // Update <2, 'ddd'> to <6, 'hhh'> - (Update both key and value) + writer.write(createUpdateBefore(2, "ddd")); // 1 eq-delete + writer.write(createUpdateAfter(6, "hhh")); + + // Update <5, 'ggg'> to <5, 'iii'> + writer.write(createUpdateBefore(5, "ggg")); // 1 eq-delete + writer.write(createUpdateAfter(5, "iii")); + + // Delete <4, 'fff'> + writer.write(createDelete(4, "fff")); // 1 eq-delete. + + result = writer.complete(); + assertThat(result.dataFiles()).hasSize(partitioned ? 2 : 1); + assertThat(result.deleteFiles()).hasSize(partitioned ? 3 : 1); + commitTransaction(result); + + assertThat(actualRowSet("*")) + .isEqualTo( + expectedRowSet(createRecord(1, "eee"), createRecord(5, "iii"), createRecord(6, "hhh"))); + } + + @TestTemplate + public void testUnpartitioned() throws IOException { + createAndInitTable(false); + testCdcEvents(false); + } + + @TestTemplate + public void testPartitioned() throws IOException { + createAndInitTable(true); + testCdcEvents(true); + } + + private void testWritePureEqDeletes(boolean partitioned) throws IOException { + createAndInitTable(partitioned); + Set equalityFieldIds = Sets.newHashSet(idFieldId()); + TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); + taskWriterFactory.initialize(1, 1); + + TaskWriter writer = taskWriterFactory.create(); + writer.write(createDelete(1, "aaa")); + writer.write(createDelete(2, "bbb")); + writer.write(createDelete(3, "ccc")); + + WriteResult result = writer.complete(); + assertThat(result.dataFiles()).isEmpty(); + assertThat(result.deleteFiles()).hasSize(partitioned ? 
3 : 1); + commitTransaction(result); + + assertThat(actualRowSet("*")).isEqualTo(expectedRowSet()); + } + + @TestTemplate + public void testUnpartitionedPureEqDeletes() throws IOException { + testWritePureEqDeletes(false); + } + + @TestTemplate + public void testPartitionedPureEqDeletes() throws IOException { + testWritePureEqDeletes(true); + } + + private void testAbort(boolean partitioned) throws IOException { + createAndInitTable(partitioned); + Set equalityFieldIds = Sets.newHashSet(idFieldId()); + TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); + taskWriterFactory.initialize(1, 1); + + TaskWriter writer = taskWriterFactory.create(); + for (int i = 0; i < 8_000; i += 2) { + writer.write(createUpdateBefore(i + 1, "aaa")); + writer.write(createUpdateAfter(i + 1, "aaa")); + + writer.write(createUpdateBefore(i + 2, "bbb")); + writer.write(createUpdateAfter(i + 2, "bbb")); + } + + // Assert the current data/delete file count. + List files = + Files.walk(Paths.get(tableDir.getPath(), "data")) + .filter(p -> p.toFile().isFile()) + .filter(p -> !p.toString().endsWith(".crc")) + .collect(Collectors.toList()); + assertThat(files).hasSize(partitioned ? 4 : 2); + + writer.abort(); + for (Path file : files) { + assertThat(file).doesNotExist(); + } + } + + @TestTemplate + public void testUnpartitionedAbort() throws IOException { + testAbort(false); + } + + @TestTemplate + public void testPartitionedAbort() throws IOException { + testAbort(true); + } + + @TestTemplate + public void testPartitionedTableWithDataAsKey() throws IOException { + createAndInitTable(true); + Set equalityFieldIds = Sets.newHashSet(dataFieldId()); + TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); + taskWriterFactory.initialize(1, 1); + + // Start the 1th transaction. + TaskWriter writer = taskWriterFactory.create(); + writer.write(createInsert(1, "aaa")); + writer.write(createInsert(2, "aaa")); + writer.write(createInsert(3, "bbb")); + writer.write(createInsert(4, "ccc")); + + WriteResult result = writer.complete(); + assertThat(result.dataFiles()).hasSize(3); + assertThat(result.deleteFiles()).hasSize(1); + commitTransaction(result); + + assertThat(actualRowSet("*")) + .isEqualTo( + expectedRowSet(createRecord(2, "aaa"), createRecord(3, "bbb"), createRecord(4, "ccc"))); + + // Start the 2nd transaction. + writer = taskWriterFactory.create(); + writer.write(createInsert(5, "aaa")); + writer.write(createInsert(6, "bbb")); + writer.write(createDelete(7, "ccc")); // 1 eq-delete. + + result = writer.complete(); + assertThat(result.dataFiles()).hasSize(2); + assertThat(result.deleteFiles()).hasSize(1); + commitTransaction(result); + + assertThat(actualRowSet("*")) + .isEqualTo( + expectedRowSet( + createRecord(2, "aaa"), + createRecord(5, "aaa"), + createRecord(3, "bbb"), + createRecord(6, "bbb"))); + } + + @TestTemplate + public void testPartitionedTableWithDataAndIdAsKey() throws IOException { + createAndInitTable(true); + Set equalityFieldIds = Sets.newHashSet(dataFieldId(), idFieldId()); + TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); + taskWriterFactory.initialize(1, 1); + + TaskWriter writer = taskWriterFactory.create(); + writer.write(createInsert(1, "aaa")); + writer.write(createInsert(2, "aaa")); + + writer.write(createDelete(2, "aaa")); // 1 pos-delete. 
+ + WriteResult result = writer.complete(); + assertThat(result.dataFiles()).hasSize(1); + assertThat(result.deleteFiles()).hasSize(1); + assertThat(result.deleteFiles()[0].content()).isEqualTo(FileContent.POSITION_DELETES); + commitTransaction(result); + + assertThat(actualRowSet("*")).isEqualTo(expectedRowSet(createRecord(1, "aaa"))); + } + + @TestTemplate + public void testEqualityColumnOnCustomPrecisionTSColumn() throws IOException { + Schema tableSchema = + new Schema( + required(3, "id", Types.IntegerType.get()), + required(4, "ts", Types.TimestampType.withZone())); + RowType flinkType = + new RowType( + false, + ImmutableList.of( + new RowType.RowField("id", new IntType()), + new RowType.RowField("ts", new LocalZonedTimestampType(3)))); + + this.table = create(tableSchema, PartitionSpec.unpartitioned()); + initTable(table); + + Set equalityIds = ImmutableSet.of(table.schema().findField("ts").fieldId()); + TaskWriterFactory taskWriterFactory = createTaskWriterFactory(flinkType, equalityIds); + taskWriterFactory.initialize(1, 1); + + TaskWriter writer = taskWriterFactory.create(); + RowDataSerializer serializer = new RowDataSerializer(flinkType); + OffsetDateTime start = OffsetDateTime.now(); + writer.write( + serializer.toBinaryRow( + GenericRowData.ofKind( + RowKind.INSERT, 1, TimestampData.fromInstant(start.toInstant())))); + writer.write( + serializer.toBinaryRow( + GenericRowData.ofKind( + RowKind.INSERT, 2, TimestampData.fromInstant(start.plusSeconds(1).toInstant())))); + writer.write( + serializer.toBinaryRow( + GenericRowData.ofKind( + RowKind.DELETE, 2, TimestampData.fromInstant(start.plusSeconds(1).toInstant())))); + + WriteResult result = writer.complete(); + // One data file + assertThat(result.dataFiles()).hasSize(1); + // One eq delete file + one pos delete file + assertThat(result.deleteFiles()).hasSize(2); + assertThat( + Arrays.stream(result.deleteFiles()) + .map(ContentFile::content) + .collect(Collectors.toSet())) + .isEqualTo(Sets.newHashSet(FileContent.POSITION_DELETES, FileContent.EQUALITY_DELETES)); + commitTransaction(result); + + Record expectedRecord = GenericRecord.create(tableSchema); + expectedRecord.setField("id", 1); + int cutPrecisionNano = start.getNano() / 1000000 * 1000000; + expectedRecord.setField("ts", start.withNano(cutPrecisionNano)); + + assertThat(actualRowSet("*")).isEqualTo(expectedRowSet(expectedRecord)); + } + + private void commitTransaction(WriteResult result) { + RowDelta rowDelta = table.newRowDelta(); + Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); + Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); + rowDelta + .validateDeletedFiles() + .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles())) + .commit(); + } + + private StructLikeSet expectedRowSet(Record... records) { + return SimpleDataUtil.expectedRowSet(table, records); + } + + private StructLikeSet actualRowSet(String... 
columns) throws IOException {
+    return SimpleDataUtil.actualRowSet(table, columns);
+  }
+
+  private TaskWriterFactory<RowData> createTaskWriterFactory(Set<Integer> equalityFieldIds) {
+    return new RowDataTaskWriterFactory(
+        SerializableTable.copyOf(table),
+        FlinkSchemaUtil.convert(table.schema()),
+        128 * 1024 * 1024,
+        format,
+        table.properties(),
+        equalityFieldIds,
+        false);
+  }
+
+  private TaskWriterFactory<RowData> createTaskWriterFactory(
+      RowType flinkType, Set<Integer> equalityFieldIds) {
+    return new RowDataTaskWriterFactory(
+        SerializableTable.copyOf(table),
+        flinkType,
+        128 * 1024 * 1024,
+        format,
+        table.properties(),
+        equalityFieldIds,
+        true);
+  }
+
+  private void createAndInitTable(boolean partitioned) {
+    if (partitioned) {
+      this.table = create(SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("data").build());
+    } else {
+      this.table = create(SCHEMA, PartitionSpec.unpartitioned());
+    }
+
+    initTable(table);
+  }
+
+  private void initTable(TestTables.TestTable testTable) {
+    testTable
+        .updateProperties()
+        .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(8 * 1024))
+        .defaultFormat(format)
+        .commit();
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java
new file mode 100644
index 000000000000..dd89f43483b0
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.sink;
+
+import java.util.List;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.flink.FlinkSchemaUtil;
+import org.apache.iceberg.flink.RowDataWrapper;
+import org.apache.iceberg.flink.SimpleDataUtil;
+import org.apache.iceberg.io.FileAppenderFactory;
+import org.apache.iceberg.io.TestAppenderFactory;
+import org.apache.iceberg.util.ArrayUtil;
+import org.apache.iceberg.util.StructLikeSet;
+
+public class TestFlinkAppenderFactory extends TestAppenderFactory<RowData> {
+
+  private final RowType rowType = FlinkSchemaUtil.convert(SCHEMA);
+
+  @Override
+  protected FileAppenderFactory<RowData> createAppenderFactory(
+      List<Integer> equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) {
+    return new FlinkAppenderFactory(
+        table,
+        table.schema(),
+        rowType,
+        table.properties(),
+        table.spec(),
+        ArrayUtil.toIntArray(equalityFieldIds),
+        eqDeleteSchema,
+        posDeleteRowSchema);
+  }
+
+  @Override
+  protected RowData createRow(Integer id, String data) {
+    return SimpleDataUtil.createRowData(id, data);
+  }
+
+  @Override
+  protected StructLikeSet expectedRowSet(Iterable<RowData> rows) {
+    StructLikeSet set = StructLikeSet.create(table.schema().asStruct());
+    for (RowData row : rows) {
+      RowDataWrapper wrapper = new RowDataWrapper(rowType, table.schema().asStruct());
+      set.add(wrapper.wrap(row));
+    }
+    return set;
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java
new file mode 100644
index 000000000000..414ee40d1357
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.sink;
+
+import java.util.List;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.flink.FlinkSchemaUtil;
+import org.apache.iceberg.flink.RowDataWrapper;
+import org.apache.iceberg.flink.SimpleDataUtil;
+import org.apache.iceberg.io.FileWriterFactory;
+import org.apache.iceberg.io.TestFileWriterFactory;
+import org.apache.iceberg.util.ArrayUtil;
+import org.apache.iceberg.util.StructLikeSet;
+
+public class TestFlinkFileWriterFactory extends TestFileWriterFactory<RowData> {
+
+  @Override
+  protected FileWriterFactory<RowData> newWriterFactory(
+      Schema dataSchema,
+      List<Integer> equalityFieldIds,
+      Schema equalityDeleteRowSchema,
+      Schema positionDeleteRowSchema) {
+    return FlinkFileWriterFactory.builderFor(table)
+        .dataSchema(table.schema())
+        .dataFileFormat(format())
+        .deleteFileFormat(format())
+        .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds))
+        .equalityDeleteRowSchema(equalityDeleteRowSchema)
+        .positionDeleteRowSchema(positionDeleteRowSchema)
+        .build();
+  }
+
+  @Override
+  protected RowData toRow(Integer id, String data) {
+    return SimpleDataUtil.createRowData(id, data);
+  }
+
+  @Override
+  protected StructLikeSet toSet(Iterable<RowData> rows) {
+    StructLikeSet set = StructLikeSet.create(table.schema().asStruct());
+    RowType flinkType = FlinkSchemaUtil.convert(table.schema());
+    for (RowData row : rows) {
+      RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct());
+      set.add(wrapper.wrap(row));
+    }
+    return set;
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java
new file mode 100644
index 000000000000..fc1236ed8855
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.List; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkIcebergSink extends TestFlinkIcebergSinkBase { + @Parameter(index = 0) + private FileFormat format; + + @Parameter(index = 1) + private int parallelism; + + @Parameter(index = 2) + private boolean partitioned; + + @Parameter(index = 3) + private boolean isTableSchema; + + @Parameters(name = "format={0}, parallelism = {1}, partitioned = {2}, isTableSchema = {3}") + public static Object[][] parameters() { + return new Object[][] { + // Remove after the deprecation of TableSchema - BEGIN + {FileFormat.AVRO, 1, true, true}, + {FileFormat.AVRO, 1, false, true}, + {FileFormat.AVRO, 2, true, true}, + {FileFormat.AVRO, 2, false, true}, + {FileFormat.ORC, 1, true, true}, + {FileFormat.ORC, 1, false, true}, + {FileFormat.ORC, 2, true, true}, + {FileFormat.ORC, 2, false, true}, + {FileFormat.PARQUET, 1, true, true}, + {FileFormat.PARQUET, 1, false, true}, + {FileFormat.PARQUET, 2, true, true}, + {FileFormat.PARQUET, 2, false, true}, + // Remove after the deprecation of TableSchema - END + + {FileFormat.AVRO, 1, true, false}, + {FileFormat.AVRO, 1, false, false}, + {FileFormat.AVRO, 2, true, false}, + {FileFormat.AVRO, 2, false, false}, + {FileFormat.ORC, 1, true, false}, + {FileFormat.ORC, 1, false, false}, + {FileFormat.ORC, 2, true, false}, + {FileFormat.ORC, 2, false, false}, + {FileFormat.PARQUET, 1, true, false}, + {FileFormat.PARQUET, 1, false, false}, + {FileFormat.PARQUET, 2, true, false}, + {FileFormat.PARQUET, 2, false, false}, + }; + } + + @BeforeEach + public void before() throws IOException { + this.table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? 
PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + + this.env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + + this.tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testWriteRowData() throws Exception { + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + + FlinkSink.forRowData(dataStream) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .append(); + + // Execute the program. + env.execute("Test Iceberg DataStream"); + + // Assert the iceberg table's records. + SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + } + + @TestTemplate + public void testWriteRow() throws Exception { + testWriteRow(parallelism, null, DistributionMode.NONE, isTableSchema); + } + + @TestTemplate + public void testWriteRowWithFlinkSchema() throws Exception { + testWriteRow(parallelism, SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE, isTableSchema); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java new file mode 100644 index 000000000000..55e00d39b316 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; +import org.apache.flink.table.types.DataType; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.extension.RegisterExtension; + +public class TestFlinkIcebergSinkBase { + + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + protected static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + protected static final TypeInformation ROW_TYPE_INFO = + new RowTypeInfo( + SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().stream() + .map(ExternalTypeInfo::of) + .toArray(TypeInformation[]::new)); + + protected static final DataFormatConverters.RowConverter CONVERTER = + new DataFormatConverters.RowConverter( + SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().toArray(DataType[]::new)); + + protected TableLoader tableLoader; + protected Table table; + protected StreamExecutionEnvironment env; + + protected BoundedTestSource createBoundedSource(List rows) { + return new BoundedTestSource<>(Collections.singletonList(rows)); + } + + protected List createRows(String prefix) { + return Lists.newArrayList( + Row.of(1, prefix + "aaa"), + Row.of(1, prefix + "bbb"), + Row.of(1, prefix + "ccc"), + Row.of(2, prefix + "aaa"), + Row.of(2, prefix + "bbb"), + Row.of(2, prefix + "ccc"), + Row.of(3, prefix + "aaa"), + Row.of(3, prefix + "bbb"), + Row.of(3, prefix + "ccc")); + } + + protected List convertToRowData(List rows) { + return rows.stream().map(CONVERTER::toInternal).collect(Collectors.toList()); + } + + protected void testWriteRow( + int writerParallelism, + ResolvedSchema resolvedSchema, + DistributionMode distributionMode, + boolean isTableSchema) + throws Exception { + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + if (isTableSchema) { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .tableSchema( + resolvedSchema != null ? 
TableSchema.fromResolvedSchema(resolvedSchema) : null) + .writeParallelism(writerParallelism) + .distributionMode(distributionMode) + .append(); + } else { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .resolvedSchema(resolvedSchema) + .writeParallelism(writerParallelism) + .distributionMode(distributionMode) + .append(); + } + + // Execute the program. + env.execute("Test Iceberg DataStream."); + + SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + } + + protected int partitionFiles(String partition) throws IOException { + return SimpleDataUtil.partitionDataFiles(table, ImmutableMap.of("data", partition)).size(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java new file mode 100644 index 000000000000..a77ddead3003 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.util.List; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkIcebergSinkBranch extends TestFlinkIcebergSinkBase { + @RegisterExtension + public static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + @Parameter(index = 0) + private String formatVersion; + + @Parameter(index = 1) + private String branch; + + @Parameter(index = 2) + private boolean isTableSchema; + + private TableLoader tableLoader; + + @Parameters(name = "formatVersion = {0}, branch = {1}, isTableSchema = {2}") + public static Object[][] parameters() { + return new Object[][] { + // Remove after the deprecation of TableSchema - BEGIN + {"1", "main", true}, + {"1", "testBranch", true}, + {"2", "main", true}, + {"2", "testBranch", true}, + // Remove after the deprecation of TableSchema - END + + {"1", "main", false}, + {"1", "testBranch", false}, + {"2", "main", false}, + {"2", "testBranch", false}, + }; + } + + @BeforeEach + public void before() throws IOException { + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of( + TableProperties.DEFAULT_FILE_FORMAT, + FileFormat.AVRO.name(), + TableProperties.FORMAT_VERSION, + formatVersion)); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100); + + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testWriteRowWithFlinkSchema() throws Exception { + testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); + verifyOtherBranchUnmodified(); + } + + private void testWriteRow(ResolvedSchema resolvedSchema, DistributionMode distributionMode) + throws Exception { + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + if (isTableSchema) { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + 
.tableSchema(TableSchema.fromResolvedSchema(resolvedSchema)) + .toBranch(branch) + .distributionMode(distributionMode) + .append(); + } else { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .resolvedSchema(resolvedSchema) + .toBranch(branch) + .distributionMode(distributionMode) + .append(); + } + + // Execute the program. + env.execute("Test Iceberg DataStream."); + + SimpleDataUtil.assertTableRows(table, convertToRowData(rows), branch); + SimpleDataUtil.assertTableRows( + table, + ImmutableList.of(), + branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH); + + verifyOtherBranchUnmodified(); + } + + private void verifyOtherBranchUnmodified() { + String otherBranch = + branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH; + if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { + assertThat(table.currentSnapshot()).isNull(); + } + + assertThat(table.snapshot(otherBranch)).isNull(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java new file mode 100644 index 000000000000..04bc5da6a9be --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java @@ -0,0 +1,602 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.sink.shuffle.StatisticsType; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +/** + * This tests the distribution mode of Flink sink. Extract them separately since it is unnecessary + * to test different file formats (Avro, Orc, Parquet) like in {@link TestFlinkIcebergSink}. + * Removing the file format dimension reduces the number of combinations from 12 to 4, which helps + * reduce test run time. 
+ */ +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkIcebergSinkDistributionMode extends TestFlinkIcebergSinkBase { + + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + private final FileFormat format = FileFormat.PARQUET; + + @Parameter(index = 0) + private int parallelism; + + @Parameter(index = 1) + private boolean partitioned; + + @Parameter(index = 2) + private int writeParallelism; + + @Parameter(index = 3) + private boolean isTableSchema; + + @Parameters( + name = "parallelism = {0}, partitioned = {1}, writeParallelism = {2}, isTableSchema = {3}") + public static Object[][] parameters() { + return new Object[][] { + // Remove after the deprecation of TableSchema - BEGIN + {1, true, 1, true}, + {1, false, 1, true}, + {2, true, 2, true}, + {2, false, 2, true}, + {1, true, 2, true}, + {1, false, 2, true}, + // Remove after the deprecation of TableSchema - END + + {1, true, 1, false}, + {1, false, 1, false}, + {2, true, 2, false}, + {2, false, 2, false}, + {1, true, 2, false}, + {1, false, 2, false}, + }; + } + + @BeforeEach + public void before() throws IOException { + this.table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + + this.env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(Math.max(parallelism, writeParallelism)); + + this.tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testShuffleByPartitionWithSchema() throws Exception { + testWriteRow(parallelism, SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH, isTableSchema); + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testJobNoneDistributeMode() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) + .commit(); + + testWriteRow(parallelism, null, DistributionMode.NONE, isTableSchema); + + if (parallelism > 1) { + if (partitioned) { + int files = partitionFiles("aaa") + partitionFiles("bbb") + partitionFiles("ccc"); + assertThat(files).isGreaterThan(3); + } + } + } + + @TestTemplate + public void testJobNullDistributionMode() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) + .commit(); + + testWriteRow(parallelism, null, null, isTableSchema); + + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testPartitionWriteMode() throws Exception { + testWriteRow(parallelism, null, DistributionMode.HASH, isTableSchema); + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + 
assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testOverrideWriteConfigWithUnknownDistributionMode() { + Map newProps = Maps.newHashMap(); + newProps.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), "UNRECOGNIZED"); + + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + FlinkSink.Builder builder = + isTableSchema + ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .setAll(newProps) + : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .setAll(newProps); + + assertThatThrownBy(builder::append) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid distribution mode: UNRECOGNIZED"); + } + + @TestTemplate + public void testRangeDistributionWithoutSortOrderUnpartitioned() { + assumeThat(partitioned).isFalse(); + + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + + int numOfCheckpoints = 6; + DataStream dataStream = + env.addSource( + createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), + ROW_TYPE_INFO); + FlinkSink.Builder builder = + isTableSchema + ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism); + + // Range distribution requires either sort order or partition spec defined + assertThatThrownBy(builder::append) + .isInstanceOf(IllegalStateException.class) + .hasMessage( + "Invalid write distribution mode: range. Need to define sort order or partition spec."); + } + + @TestTemplate + public void testRangeDistributionWithoutSortOrderPartitioned() throws Exception { + assumeThat(partitioned).isTrue(); + + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + + int numOfCheckpoints = 6; + DataStream dataStream = + env.addSource( + createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), + ROW_TYPE_INFO); + FlinkSink.Builder builder = + isTableSchema + ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism); + + // sort based on partition columns + builder.append(); + env.execute(getClass().getSimpleName()); + + table.refresh(); + // ordered in reverse timeline from the newest snapshot to the oldest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Sometimes we will have more checkpoints than the bounded source if we pass the + // auto checkpoint interval. Thus producing multiple snapshots. 
+ assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); + } + + @TestTemplate + public void testRangeDistributionWithNullValue() throws Exception { + assumeThat(partitioned).isTrue(); + + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + + int numOfCheckpoints = 6; + List> charRows = createCharRows(numOfCheckpoints, 10); + charRows.add(ImmutableList.of(Row.of(1, null))); + DataStream dataStream = + env.addSource(createRangeDistributionBoundedSource(charRows), ROW_TYPE_INFO); + FlinkSink.Builder builder = + isTableSchema + ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism); + + // sort based on partition columns + builder.append(); + env.execute(getClass().getSimpleName()); + + table.refresh(); + // ordered in reverse timeline from the newest snapshot to the oldest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Sometimes we will have more checkpoints than the bounded source if we pass the + // auto checkpoint interval. Thus producing multiple snapshots. + assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); + } + + @TestTemplate + public void testRangeDistributionWithSortOrder() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + table.replaceSortOrder().asc("data").commit(); + + int numOfCheckpoints = 6; + DataStream dataStream = + env.addSource( + createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), + ROW_TYPE_INFO); + if (isTableSchema) { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Map) + .append(); + } else { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Map) + .append(); + } + + env.execute(getClass().getSimpleName()); + + table.refresh(); + // ordered in reverse timeline from the newest snapshot to the oldest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Sometimes we will have more checkpoints than the bounded source if we pass the + // auto checkpoint interval. Thus producing multiple snapshots. + assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); + + // It takes 2 checkpoint cycle for statistics collection and application + // of the globally aggregated statistics in the range partitioner. 
+ // The last two checkpoints should have range shuffle applied + List rangePartitionedCycles = + snapshots.subList(snapshots.size() - 2, snapshots.size()); + + if (partitioned) { + for (Snapshot snapshot : rangePartitionedCycles) { + List addedDataFiles = + Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); + // up to 26 partitions + assertThat(addedDataFiles).hasSizeLessThanOrEqualTo(26); + } + } else { + for (Snapshot snapshot : rangePartitionedCycles) { + List addedDataFiles = + Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); + // each writer task should only write one file for non-partition sort column + assertThat(addedDataFiles).hasSize(writeParallelism); + // verify there is no overlap in min-max stats range + if (parallelism > 1) { + assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); + } + } + } + } + + @TestTemplate + public void testRangeDistributionSketchWithSortOrder() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + table.replaceSortOrder().asc("id").commit(); + + int numOfCheckpoints = 6; + DataStream dataStream = + env.addSource( + createRangeDistributionBoundedSource(createIntRows(numOfCheckpoints, 1_000)), + ROW_TYPE_INFO); + if (isTableSchema) { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Sketch) + .append(); + } else { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Sketch) + .append(); + } + + env.execute(getClass().getSimpleName()); + + table.refresh(); + // ordered in reverse timeline from the newest snapshot to the oldest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Sometimes we will have more checkpoints than the bounded source if we pass the + // auto checkpoint interval. Thus producing multiple snapshots. + assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); + + // It takes 2 checkpoint cycle for statistics collection and application + // of the globally aggregated statistics in the range partitioner. + // The last two checkpoints should have range shuffle applied + List rangePartitionedCycles = + snapshots.subList(snapshots.size() - 2, snapshots.size()); + + // since the input has a single value for the data column, + // it is always the same partition. 
Hence there is no difference + // for partitioned or not + for (Snapshot snapshot : rangePartitionedCycles) { + List addedDataFiles = + Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); + // each writer task should only write one file for non-partition sort column + assertThat(addedDataFiles).hasSize(writeParallelism); + // verify there is no overlap in min-max stats range + if (writeParallelism > 2) { + assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); + } + } + } + + /** Test migration from Map stats to Sketch stats */ + @TestTemplate + public void testRangeDistributionStatisticsMigration() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + table.replaceSortOrder().asc("id").commit(); + + int numOfCheckpoints = 6; + List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); + for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { + // checkpointId 2 would emit 11_000 records which is larger than + // the OPERATOR_SKETCH_SWITCH_THRESHOLD of 10_000. + // This should trigger the stats migration. + int maxId = checkpointId < 2 ? 1_000 : 11_000; + List rows = Lists.newArrayListWithCapacity(maxId); + for (int j = 0; j < maxId; ++j) { + // fixed value "a" for the data (possible partition column) + rows.add(Row.of(j, "a")); + } + + rowsPerCheckpoint.add(rows); + } + + DataStream dataStream = + env.addSource(createRangeDistributionBoundedSource(rowsPerCheckpoint), ROW_TYPE_INFO); + if (isTableSchema) { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Auto) + .append(); + } else { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Auto) + .append(); + } + + env.execute(getClass().getSimpleName()); + + table.refresh(); + // ordered in reverse timeline from the newest snapshot to the oldest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Sometimes we will have more checkpoints than the bounded source if we pass the + // auto checkpoint interval. Thus producing multiple snapshots. + assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); + + // It takes 2 checkpoint cycle for statistics collection and application + // of the globally aggregated statistics in the range partitioner. + // The last two checkpoints should have range shuffle applied + List rangePartitionedCycles = + snapshots.subList(snapshots.size() - 2, snapshots.size()); + + // since the input has a single value for the data column, + // it is always the same partition. 
Hence there is no difference + // for partitioned or not + for (Snapshot snapshot : rangePartitionedCycles) { + List addedDataFiles = + Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); + // each writer task should only write one file for non-partition sort column + // sometimes + assertThat(addedDataFiles).hasSize(writeParallelism); + // verify there is no overlap in min-max stats range + if (writeParallelism > 1) { + assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); + } + } + } + + private BoundedTestSource createRangeDistributionBoundedSource( + List> rowsPerCheckpoint) { + return new BoundedTestSource<>(rowsPerCheckpoint); + } + + private List> createCharRows(int numOfCheckpoints, int countPerChar) { + List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); + for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { + List rows = Lists.newArrayListWithCapacity(26 * countPerChar); + for (int j = 0; j < countPerChar; ++j) { + for (char c = 'a'; c <= 'z'; ++c) { + rows.add(Row.of(1, String.valueOf(c))); + } + } + + rowsPerCheckpoint.add(rows); + } + + return rowsPerCheckpoint; + } + + private List> createIntRows(int numOfCheckpoints, int maxId) { + List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); + for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { + List rows = Lists.newArrayListWithCapacity(maxId); + for (int j = 0; j < maxId; ++j) { + // fixed value "a" for the data (possible partition column) + rows.add(Row.of(j, "a")); + } + + rowsPerCheckpoint.add(rows); + } + + return rowsPerCheckpoint; + } + + private void assertIdColumnStatsNoRangeOverlap(DataFile file1, DataFile file2) { + // id column has fieldId 1 + int file1LowerBound = + Conversions.fromByteBuffer(Types.IntegerType.get(), file1.lowerBounds().get(1)); + int file1UpperBound = + Conversions.fromByteBuffer(Types.IntegerType.get(), file1.upperBounds().get(1)); + int file2LowerBound = + Conversions.fromByteBuffer(Types.IntegerType.get(), file2.lowerBounds().get(1)); + int file2UpperBound = + Conversions.fromByteBuffer(Types.IntegerType.get(), file2.upperBounds().get(1)); + + if (file1LowerBound < file2LowerBound) { + assertThat(file1UpperBound).isLessThanOrEqualTo(file2LowerBound); + } else { + assertThat(file2UpperBound).isLessThanOrEqualTo(file1LowerBound); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java new file mode 100644 index 000000000000..018b877a0115 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +/** + * This class tests the more extended features of Flink sink. Extract them separately since it is + * unnecessary to test all the parameters combinations in {@link TestFlinkIcebergSink}. Each test + * method in {@link TestFlinkIcebergSink} runs 12 combinations, which are expensive and slow. + */ +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkIcebergSinkExtended extends TestFlinkIcebergSinkBase { + private final boolean partitioned = true; + private final int parallelism = 2; + private final FileFormat format = FileFormat.PARQUET; + + @Parameter private boolean isTableSchema; + + @Parameters(name = "isTableSchema={0}") + private static Object[][] parameters() { + return new Object[][] {{true}, {false}}; + } + + @BeforeEach + public void before() throws IOException { + this.table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + + this.env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + + this.tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testTwoSinksInDisjointedDAG() throws Exception { + Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + + Table leftTable = + CATALOG_EXTENSION + .catalog() + .createTable( + TableIdentifier.of("left"), + SimpleDataUtil.SCHEMA, + partitioned + ? 
PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + props); + TableLoader leftTableLoader = + TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("left")); + + Table rightTable = + CATALOG_EXTENSION + .catalog() + .createTable( + TableIdentifier.of("right"), + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + props); + TableLoader rightTableLoader = + TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("right")); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + env.getConfig().disableAutoGeneratedUIDs(); + + List leftRows = createRows("left-"); + DataStream leftStream = + env.addSource(createBoundedSource(leftRows), ROW_TYPE_INFO) + .name("leftCustomSource") + .uid("leftCustomSource"); + if (isTableSchema) { + FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(leftTable) + .tableLoader(leftTableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .distributionMode(DistributionMode.NONE) + .uidPrefix("leftIcebergSink") + .append(); + } else { + FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) + .table(leftTable) + .tableLoader(leftTableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .distributionMode(DistributionMode.NONE) + .uidPrefix("leftIcebergSink") + .append(); + } + + List rightRows = createRows("right-"); + DataStream rightStream = + env.addSource(createBoundedSource(rightRows), ROW_TYPE_INFO) + .name("rightCustomSource") + .uid("rightCustomSource"); + if (isTableSchema) { + FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(rightTable) + .tableLoader(rightTableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.HASH) + .uidPrefix("rightIcebergSink") + .setSnapshotProperty("flink.test", TestFlinkIcebergSink.class.getName()) + .setSnapshotProperties(Collections.singletonMap("direction", "rightTable")) + .append(); + } else { + FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) + .table(rightTable) + .tableLoader(rightTableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.HASH) + .uidPrefix("rightIcebergSink") + .setSnapshotProperty("flink.test", TestFlinkIcebergSink.class.getName()) + .setSnapshotProperties(Collections.singletonMap("direction", "rightTable")) + .append(); + } + + // Execute the program. 
+ env.execute("Test Iceberg DataStream."); + + SimpleDataUtil.assertTableRows(leftTable, convertToRowData(leftRows)); + SimpleDataUtil.assertTableRows(rightTable, convertToRowData(rightRows)); + + leftTable.refresh(); + assertThat(leftTable.currentSnapshot().summary()).doesNotContainKeys("flink.test", "direction"); + rightTable.refresh(); + assertThat(rightTable.currentSnapshot().summary()) + .containsEntry("flink.test", TestFlinkIcebergSink.class.getName()) + .containsEntry("direction", "rightTable"); + } + + @TestTemplate + public void testOverrideWriteConfigWithUnknownFileFormat() { + Map newProps = Maps.newHashMap(); + newProps.put(FlinkWriteOptions.WRITE_FORMAT.key(), "UNRECOGNIZED"); + + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + FlinkSink.Builder builder = + isTableSchema + ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps) + : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps); + + assertThatThrownBy(builder::append) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid file format: UNRECOGNIZED"); + } + + @Test + public void testWriteRowWithTableRefreshInterval() throws Exception { + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + + Configuration flinkConf = new Configuration(); + flinkConf.setString(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key(), "100ms"); + + FlinkSink.forRowData(dataStream) + .table(table) + .tableLoader(tableLoader) + .flinkConf(flinkConf) + .writeParallelism(parallelism) + .append(); + + // Execute the program. + env.execute("Test Iceberg DataStream"); + + // Assert the iceberg table's records. + SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java new file mode 100644 index 000000000000..a5799288b5e3 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java @@ -0,0 +1,255 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.expressions.Expressions.bucket; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.UUID; +import java.util.stream.Collectors; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.connector.datagen.source.DataGeneratorSource; +import org.apache.flink.connector.datagen.source.GeneratorFunction; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.extension.RegisterExtension; + +/** + * Test range distribution with bucketing partition column. Compared to hash distribution, range + * distribution is more general to handle bucketing column while achieving even distribution of + * traffic to writer tasks. + * + *
+ * <ul>
+ *   <li>keyBy on low cardinality (e.g. 60) may not achieve balanced data distribution.
+ *   <li>number of buckets (e.g. 60) is not divisible by the writer parallelism (e.g. 40).
+ *   <li>number of buckets (e.g. 60) is smaller than the writer parallelism (e.g. 120).
+ * </ul>
    + */ +@Timeout(value = 30) +@Disabled // https://github.com/apache/iceberg/pull/11305#issuecomment-2415207097 +public class TestFlinkIcebergSinkRangeDistributionBucketing { + private static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = + new Configuration() + // disable classloader check as Avro may cache class/object in the serializers. + .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + + // max supported parallelism is 16 (= 4 x 4) + @RegisterExtension + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(4) + .setNumberSlotsPerTaskManager(4) + .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) + .build()); + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + private static final int NUM_BUCKETS = 4; + private static final int NUM_OF_CHECKPOINTS = 6; + private static final int ROW_COUNT_PER_CHECKPOINT = 200; + private static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(2, "uuid", Types.UUIDType.get()), + Types.NestedField.optional(3, "data", Types.StringType.get())); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).hour("ts").bucket("uuid", NUM_BUCKETS).build(); + private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); + + private TableLoader tableLoader; + private Table table; + + @BeforeEach + public void before() throws IOException { + this.tableLoader = CATALOG_EXTENSION.tableLoader(); + this.table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SCHEMA, + SPEC, + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.PARQUET.name())); + + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + + // Assuming ts is on ingestion/processing time. Writer only writes to 1 or 2 hours concurrently. + // Only sort on the bucket column to avoid each writer task writes to 60 buckets/files + // concurrently. 
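As a minimal sketch of what the next statement sets up (the SortOrder import and the subtask routing shown are assumptions for illustration; the test itself only asserts data file counts), the equivalent unbound sort order on the bucket transform would look like:

    SortOrder bucketOrder =
        SortOrder.builderFor(SCHEMA)
            .asc(bucket("uuid", NUM_BUCKETS)) // Expressions.bucket is statically imported above
            .build();
    // With the aggregated range statistics in place and writer parallelism 2, bucket ids could
    // be routed roughly as {0, 1} -> subtask 0 and {2, 3} -> subtask 1.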
+ table.replaceSortOrder().asc(bucket("uuid", NUM_BUCKETS)).commit(); + } + + @AfterEach + public void after() throws Exception { + CATALOG_EXTENSION.catalog().dropTable(TestFixtures.TABLE_IDENTIFIER); + } + + /** number of buckets 4 matches writer parallelism of 4 */ + @Test + public void testBucketNumberEqualsToWriterParallelism() throws Exception { + testParallelism(4); + } + + /** number of buckets 4 is less than writer parallelism of 6 */ + @Test + public void testBucketNumberLessThanWriterParallelismNotDivisible() throws Exception { + testParallelism(6); + } + + /** number of buckets 4 is less than writer parallelism of 8 */ + @Test + public void testBucketNumberLessThanWriterParallelismDivisible() throws Exception { + testParallelism(8); + } + + /** number of buckets 4 is greater than writer parallelism of 3 */ + @Test + public void testBucketNumberHigherThanWriterParallelismNotDivisible() throws Exception { + testParallelism(3); + } + + /** number of buckets 4 is greater than writer parallelism of 2 */ + @Test + public void testBucketNumberHigherThanWriterParallelismDivisible() throws Exception { + testParallelism(2); + } + + private void testParallelism(int parallelism) throws Exception { + try (StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism)) { + + DataGeneratorSource generatorSource = + new DataGeneratorSource<>( + new RowGenerator(), + ROW_COUNT_PER_CHECKPOINT * NUM_OF_CHECKPOINTS, + RateLimiterStrategy.perCheckpoint(ROW_COUNT_PER_CHECKPOINT), + FlinkCompatibilityUtil.toTypeInfo(ROW_TYPE)); + DataStream dataStream = + env.fromSource(generatorSource, WatermarkStrategy.noWatermarks(), "Data Generator"); + + FlinkSink.forRowData(dataStream) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .append(); + env.execute(getClass().getSimpleName()); + + table.refresh(); + // ordered in reverse timeline from the oldest snapshot to the newest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Source rate limit per checkpoint cycle may not be super precise. + // There could be more checkpoint cycles and commits than planned. + assertThat(snapshots).hasSizeGreaterThanOrEqualTo(NUM_OF_CHECKPOINTS); + + // It takes 2 checkpoint cycle for statistics collection and application + // of the globally aggregated statistics in the range partitioner. + // The last two checkpoints should have range shuffle applied + List rangePartitionedCycles = + snapshots.subList(snapshots.size() - 2, snapshots.size()); + + for (Snapshot snapshot : rangePartitionedCycles) { + List addedDataFiles = + Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); + assertThat(addedDataFiles) + .hasSizeLessThanOrEqualTo(maxAddedDataFilesPerCheckpoint(parallelism)); + } + } + } + + /** + * Traffic is not perfectly balanced across all buckets in the small sample size Range + * distribution of the bucket id may cross subtask boundary. Hence the number of committed data + * files per checkpoint maybe larger than writer parallelism or the number of buckets. But it + * should not be more than the sum of those two. 
Without range distribution, the number of data + * files per commit can be 4x of parallelism (as the number of buckets is 4). + */ + private int maxAddedDataFilesPerCheckpoint(int parallelism) { + return NUM_BUCKETS + parallelism; + } + + private static class RowGenerator implements GeneratorFunction { + // use constant timestamp so that all rows go to the same hourly partition + private final long ts = System.currentTimeMillis(); + + @Override + public RowData map(Long index) throws Exception { + // random uuid should result in relatively balanced distribution across buckets + UUID uuid = UUID.randomUUID(); + ByteBuffer uuidByteBuffer = ByteBuffer.allocate(16); + uuidByteBuffer.putLong(uuid.getMostSignificantBits()); + uuidByteBuffer.putLong(uuid.getLeastSignificantBits()); + return GenericRowData.of( + TimestampData.fromEpochMillis(ts), + uuidByteBuffer.array(), + StringData.fromString("row-" + index)); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java new file mode 100644 index 000000000000..ffd40b6cdc95 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.util.List; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +@ExtendWith(ParameterizedTestExtension.class) +@Timeout(value = 60) +public class TestFlinkIcebergSinkV2 extends TestFlinkIcebergSinkV2Base { + @RegisterExtension + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + @BeforeEach + public void setupTable() { + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of( + TableProperties.DEFAULT_FILE_FORMAT, + format.name(), + TableProperties.FORMAT_VERSION, + String.valueOf(FORMAT_V2))); + + table + .updateProperties() + .set(TableProperties.DEFAULT_FILE_FORMAT, format.name()) + .set(TableProperties.WRITE_DISTRIBUTION_MODE, writeDistributionMode) + .commit(); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100L) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testCheckAndGetEqualityFieldIds() { + table + .updateSchema() + .allowIncompatibleChanges() + .addRequiredColumn("type", Types.StringType.get()) + .setIdentifierFields("type") + .commit(); + + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + FlinkSink.Builder builder = + isTableSchema + ? 
FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA).table(table) + : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); + + // Use schema identifier field IDs as equality field id list by default + assertThat(builder.checkAndGetEqualityFieldIds()) + .containsExactlyInAnyOrderElementsOf(table.schema().identifierFieldIds()); + + // Use user-provided equality field column as equality field id list + builder.equalityFieldColumns(Lists.newArrayList("id")); + assertThat(builder.checkAndGetEqualityFieldIds()) + .containsExactlyInAnyOrder(table.schema().findField("id").fieldId()); + + builder.equalityFieldColumns(Lists.newArrayList("type")); + assertThat(builder.checkAndGetEqualityFieldIds()) + .containsExactlyInAnyOrder(table.schema().findField("type").fieldId()); + } + + @TestTemplate + public void testChangeLogOnIdKey() throws Exception { + testChangeLogOnIdKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertOnlyDeletesOnDataKey() throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "aaa"), row("-D", 2, "bbb"))); + + List> expectedRecords = + ImmutableList.of(ImmutableList.of(record(1, "aaa")), ImmutableList.of()); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + true, + elementsPerCheckpoint, + expectedRecords, + SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testChangeLogOnDataKey() throws Exception { + testChangeLogOnDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testChangeLogOnIdDataKey() throws Exception { + testChangeLogOnIdDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testChangeLogOnSameKey() throws Exception { + testChangeLogOnSameKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertModeCheck() { + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + FlinkSink.Builder builder = + isTableSchema + ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .writeParallelism(parallelism) + .upsert(true) + : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .tableLoader(tableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .upsert(true); + + assertThatThrownBy( + () -> + builder + .equalityFieldColumns(ImmutableList.of("id", "data")) + .overwrite(true) + .append()) + .isInstanceOf(IllegalStateException.class) + .hasMessage( + "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); + + if (writeDistributionMode.equals(DistributionMode.RANGE.modeName()) && !partitioned) { + // validation error thrown from distributeDataStream + assertThatThrownBy( + () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()) + .isInstanceOf(IllegalStateException.class) + .hasMessage( + "Invalid write distribution mode: range. 
Need to define sort order or partition spec."); + } else { + // validation error thrown from appendWriter + assertThatThrownBy( + () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()) + .isInstanceOf(IllegalStateException.class) + .hasMessage( + "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); + } + } + + @TestTemplate + public void testUpsertOnIdKey() throws Exception { + testUpsertOnIdKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertOnDataKey() throws Exception { + testUpsertOnDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertOnIdDataKey() throws Exception { + testUpsertOnIdDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testDeleteStats() throws Exception { + assumeThat(format).isNotEqualTo(FileFormat.AVRO); + + List> elementsPerCheckpoint = + ImmutableList.of( + // Checkpoint #1 + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa"))); + + List> expectedRecords = ImmutableList.of(ImmutableList.of(record(1, "aaa"))); + + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords, + "main"); + + DeleteFile deleteFile = table.currentSnapshot().addedDeleteFiles(table.io()).iterator().next(); + String fromStat = + new String( + deleteFile.lowerBounds().get(MetadataColumns.DELETE_FILE_PATH.fieldId()).array()); + DataFile dataFile = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); + assumeThat(fromStat).isEqualTo(dataFile.location()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java new file mode 100644 index 000000000000..12a4593d039e --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.StructLikeSet; + +class TestFlinkIcebergSinkV2Base { + + static final int FORMAT_V2 = 2; + static final TypeInformation ROW_TYPE_INFO = + new RowTypeInfo( + SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().stream() + .map(ExternalTypeInfo::of) + .toArray(TypeInformation[]::new)); + + static final int ROW_ID_POS = 0; + static final int ROW_DATA_POS = 1; + + TableLoader tableLoader; + Table table; + StreamExecutionEnvironment env; + + @Parameter(index = 0) + FileFormat format; + + @Parameter(index = 1) + int parallelism = 1; + + @Parameter(index = 2) + boolean partitioned; + + @Parameter(index = 3) + String writeDistributionMode; + + @Parameter(index = 4) + boolean isTableSchema; + + @Parameters( + name = + "FileFormat={0}, Parallelism={1}, Partitioned={2}, WriteDistributionMode={3}, IsTableSchema={4}") + public static Object[][] parameters() { + return new Object[][] { + // Remove after the deprecation of TableSchema - BEGIN + {FileFormat.AVRO, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true}, + {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true}, + {FileFormat.AVRO, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true}, + {FileFormat.AVRO, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true}, + {FileFormat.ORC, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, true}, + {FileFormat.ORC, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, true}, + {FileFormat.ORC, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, true}, + {FileFormat.ORC, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, true}, + {FileFormat.PARQUET, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, true}, + {FileFormat.PARQUET, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, true}, + {FileFormat.PARQUET, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, true}, + {FileFormat.PARQUET, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, true}, + // Remove after the deprecation of TableSchema - END + + {FileFormat.AVRO, 1, true, 
TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false}, + {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false}, + {FileFormat.AVRO, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false}, + {FileFormat.AVRO, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false}, + {FileFormat.ORC, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, false}, + {FileFormat.ORC, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, false}, + {FileFormat.ORC, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, false}, + {FileFormat.ORC, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, false}, + {FileFormat.PARQUET, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, false}, + {FileFormat.PARQUET, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, false}, + {FileFormat.PARQUET, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, false}, + {FileFormat.PARQUET, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, false}, + }; + } + + static final Map ROW_KIND_MAP = + ImmutableMap.of( + "+I", RowKind.INSERT, + "-D", RowKind.DELETE, + "-U", RowKind.UPDATE_BEFORE, + "+U", RowKind.UPDATE_AFTER); + + Row row(String rowKind, int id, String data) { + RowKind kind = ROW_KIND_MAP.get(rowKind); + if (kind == null) { + throw new IllegalArgumentException("Unknown row kind: " + rowKind); + } + + return Row.ofKind(kind, id, data); + } + + void testUpsertOnIdDataKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 2, "bbb")), + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 2, "bbb"), row("+I", 2, "ccc")), + ImmutableList.of(row("+U", 1, "bbb"), row("-U", 1, "ccc"), row("-D", 1, "aaa"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "aaa"), record(2, "bbb")), + ImmutableList.of(record(1, "aaa"), record(2, "ccc")), + ImmutableList.of(record(1, "bbb"), record(2, "ccc"))); + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + true, + elementsPerCheckpoint, + expectedRecords, + branch); + } + + void testChangeLogOnIdDataKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 2, "bbb"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa")), + ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "bbb"), record(2, "aaa"), record(2, "bbb")), + ImmutableList.of( + record(1, "aaa"), record(1, "bbb"), record(1, "ccc"), record(2, "bbb")), + ImmutableList.of( + record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "bbb"))); + + testChangeLogs( + ImmutableList.of("data", "id"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords, + branch); + } + + void testChangeLogOnSameKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + // Checkpoint #1 + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa")), + // Checkpoint #2 + ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa")), + // Checkpoint #3 + ImmutableList.of(row("-D", 1, "aaa"), row("+I", 1, "aaa")), + // Checkpoint #4 + ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 1, 
"aaa"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa"), record(1, "aaa"))); + + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords, + branch); + } + + void testChangeLogOnDataKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 2, "bbb"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa")), + ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"), row("+I", 2, "ccc"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "bbb"), record(2, "aaa")), + ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc")), + ImmutableList.of( + record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "ccc"))); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + false, + elementsPerCheckpoint, + expectedRecords, + branch); + } + + void testUpsertOnDataKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(row("+I", 1, "aaa"), row("+I", 2, "aaa"), row("+I", 3, "bbb")), + ImmutableList.of(row("+U", 4, "aaa"), row("-U", 3, "bbb"), row("+U", 5, "bbb")), + ImmutableList.of(row("+I", 6, "aaa"), row("+U", 7, "bbb"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(2, "aaa"), record(3, "bbb")), + ImmutableList.of(record(4, "aaa"), record(5, "bbb")), + ImmutableList.of(record(6, "aaa"), record(7, "bbb"))); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + true, + elementsPerCheckpoint, + expectedRecords, + branch); + } + + void testChangeLogOnIdKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa"), + row("-D", 2, "aaa"), + row("+I", 2, "bbb")), + ImmutableList.of( + row("-U", 2, "bbb"), row("+U", 2, "ccc"), row("-D", 2, "ccc"), row("+I", 2, "ddd")), + ImmutableList.of( + row("-D", 1, "bbb"), + row("+I", 1, "ccc"), + row("-D", 1, "ccc"), + row("+I", 1, "ddd"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "bbb"), record(2, "bbb")), + ImmutableList.of(record(1, "bbb"), record(2, "ddd")), + ImmutableList.of(record(1, "ddd"), record(2, "ddd"))); + + if (partitioned && writeDistributionMode.equals(TableProperties.WRITE_DISTRIBUTION_MODE_HASH)) { + assertThatThrownBy( + () -> + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + false, + elementsPerCheckpoint, + expectedRecords, + branch)) + .isInstanceOf(IllegalStateException.class) + .hasMessageStartingWith( + "In 'hash' distribution mode with equality fields set, source column") + .hasMessageContaining("should be included in equality fields:"); + + } else { + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + false, + elementsPerCheckpoint, + expectedRecords, + branch); + } + } + + void testUpsertOnIdKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "bbb")), + ImmutableList.of(row("+I", 1, 
"ccc")), + ImmutableList.of(row("+U", 1, "ddd"), row("+I", 1, "eee"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "bbb")), + ImmutableList.of(record(1, "ccc")), + ImmutableList.of(record(1, "eee"))); + + if (!partitioned) { + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + true, + elementsPerCheckpoint, + expectedRecords, + branch); + } else { + assertThatThrownBy( + () -> + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + true, + elementsPerCheckpoint, + expectedRecords, + branch)) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("should be included in equality fields:"); + } + } + + void testChangeLogs( + List equalityFieldColumns, + KeySelector keySelector, + boolean insertAsUpsert, + List> elementsPerCheckpoint, + List> expectedRecordsPerCheckpoint, + String branch) + throws Exception { + DataStream dataStream = + env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); + + if (isTableSchema) { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .writeParallelism(parallelism) + .equalityFieldColumns(equalityFieldColumns) + .upsert(insertAsUpsert) + .toBranch(branch) + .append(); + } else { + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .tableLoader(tableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .equalityFieldColumns(equalityFieldColumns) + .upsert(insertAsUpsert) + .toBranch(branch) + .append(); + } + + // Execute the program. + env.execute("Test Iceberg Change-Log DataStream."); + + table.refresh(); + List snapshots = findValidSnapshots(); + int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); + assertThat(snapshots).hasSize(expectedSnapshotNum); + + for (int i = 0; i < expectedSnapshotNum; i++) { + long snapshotId = snapshots.get(i).snapshotId(); + List expectedRecords = expectedRecordsPerCheckpoint.get(i); + assertThat(actualRowSet(snapshotId, "*")) + .as("Should have the expected records for the checkpoint#" + i) + .isEqualTo(expectedRowSet(expectedRecords.toArray(new Record[0]))); + } + } + + Record record(int id, String data) { + return SimpleDataUtil.createRecord(id, data); + } + + List findValidSnapshots() { + List validSnapshots = Lists.newArrayList(); + for (Snapshot snapshot : table.snapshots()) { + if (snapshot.allManifests(table.io()).stream() + .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { + validSnapshots.add(snapshot); + } + } + return validSnapshots; + } + + StructLikeSet expectedRowSet(Record... records) { + return SimpleDataUtil.expectedRowSet(table, records); + } + + StructLikeSet actualRowSet(long snapshotId, String... 
columns) throws IOException { + table.refresh(); + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + try (CloseableIterable reader = + IcebergGenerics.read(table).useSnapshot(snapshotId).select(columns).build()) { + reader.forEach(set::add); + } + return set; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java new file mode 100644 index 000000000000..8ce3e1886f40 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkIcebergSinkV2Branch extends TestFlinkIcebergSinkV2Base { + @RegisterExtension + static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + @Parameter(index = 5) + protected String branch; + + @Parameters( + name = + "FileFormat={0}, Parallelism={1}, Partitioned={2}, WriteDistributionMode={3}, IsTableSchema={4}, Branch={5}") + public static Object[][] parameters() { + return new Object[][] { + // Remove after the deprecation of TableSchema - BEGIN + {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true, "main"}, + {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true, "testBranch"}, + // Remove after the deprecation of TableSchema - END + + {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false, "main"}, + { + FileFormat.AVRO, 1, false, 
TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false, "testBranch" + }, + }; + } + + @BeforeEach + public void before() throws IOException { + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of( + TableProperties.DEFAULT_FILE_FORMAT, + FileFormat.AVRO.name(), + TableProperties.FORMAT_VERSION, + "2")); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100); + + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testChangeLogOnIdKey() throws Exception { + testChangeLogOnIdKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testChangeLogOnDataKey() throws Exception { + testChangeLogOnDataKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testChangeLogOnIdDataKey() throws Exception { + testChangeLogOnIdDataKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testUpsertOnIdKey() throws Exception { + testUpsertOnIdKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testUpsertOnDataKey() throws Exception { + testUpsertOnDataKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testUpsertOnIdDataKey() throws Exception { + testUpsertOnIdDataKey(branch); + verifyOtherBranchUnmodified(); + } + + private void verifyOtherBranchUnmodified() { + String otherBranch = + branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH; + if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { + assertThat(table.currentSnapshot()).isNull(); + } + + assertThat(table.snapshot(otherBranch)).isNull(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java new file mode 100644 index 000000000000..0feb4cc282d2 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java @@ -0,0 +1,618 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.sink.shuffle.StatisticsType; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +/** + * This tests the distribution mode of the IcebergSink. Extract them separately since it is + * unnecessary to test different file formats (Avro, Orc, Parquet) like in {@link TestIcebergSink}. + * Removing the file format dimension reduces the number of combinations from 12 to 4, which helps + * reduce test run time. 
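For orientation, a minimal sketch of how a distribution mode can be selected per sink in these tests (mirroring builder calls that appear later in this class; illustrative only, not the exact setup of every test):

    Map<String, String> overrides =
        ImmutableMap.of(
            FlinkWriteOptions.DISTRIBUTION_MODE.key(), DistributionMode.HASH.modeName());
    IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA)
        .table(table)
        .tableLoader(tableLoader)
        .writeParallelism(writeParallelism)
        .setAll(overrides)
        .append();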
+ */ +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkIcebergSinkV2DistributionMode extends TestFlinkIcebergSinkBase { + + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + private final FileFormat format = FileFormat.PARQUET; + + @Parameter(index = 0) + private int parallelism; + + @Parameter(index = 1) + private boolean partitioned; + + @Parameter(index = 2) + private int writeParallelism; + + @Parameter(index = 3) + private boolean isTableSchema; + + @Parameters( + name = "parallelism = {0}, partitioned = {1}, writeParallelism = {2}, isTableSchema = {3}") + public static Object[][] parameters() { + return new Object[][] { + // Remove after the deprecation of TableSchema - BEGIN + {1, true, 1, true}, + {1, false, 1, true}, + {2, true, 2, true}, + {2, false, 2, true}, + {1, true, 2, true}, + {1, false, 2, true}, + // Remove after the deprecation of TableSchema - END + + {1, true, 1, false}, + {1, false, 1, false}, + {2, true, 2, false}, + {2, false, 2, false}, + {1, true, 2, false}, + {1, false, 2, false}, + }; + } + + @BeforeEach + public void before() throws IOException { + this.table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + + this.env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(Math.max(parallelism, writeParallelism)); + + this.tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testShuffleByPartitionWithSchema() throws Exception { + testWriteRow(parallelism, SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH, isTableSchema); + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testJobNoneDistributeMode() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) + .commit(); + + testWriteRow(parallelism, null, DistributionMode.NONE, isTableSchema); + + if (parallelism > 1) { + if (partitioned) { + int files = partitionFiles("aaa") + partitionFiles("bbb") + partitionFiles("ccc"); + assertThat(files).isGreaterThan(3); + } + } + } + + @TestTemplate + public void testJobNullDistributionMode() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) + .commit(); + + testWriteRow(parallelism, null, null, isTableSchema); + + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testPartitionWriteMode() throws Exception { + testWriteRow(parallelism, null, DistributionMode.HASH, isTableSchema); + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + 
assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testOverrideWriteConfigWithUnknownDistributionMode() { + Map newProps = Maps.newHashMap(); + newProps.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), "UNRECOGNIZED"); + + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .setAll(newProps) + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .setAll(newProps) + .append(); + } + + assertThatThrownBy(env::execute) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid distribution mode: UNRECOGNIZED"); + } + + @TestTemplate + public void testRangeDistributionWithoutSortOrderUnpartitioned() throws Exception { + assumeThat(partitioned).isFalse(); + + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + + int numOfCheckpoints = 6; + DataStream dataStream = + env.addSource( + createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), + ROW_TYPE_INFO); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .append(); + } + + // Range distribution requires either sort order or partition spec defined + assertThatThrownBy(env::execute) + .isInstanceOf(IllegalStateException.class) + .hasMessage( + "Invalid write distribution mode: range. Need to define sort order or partition spec."); + } + + @TestTemplate + public void testRangeDistributionWithoutSortOrderPartitioned() throws Exception { + assumeThat(partitioned).isTrue(); + + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + + int numOfCheckpoints = 6; + DataStream dataStream = + env.addSource( + createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), + ROW_TYPE_INFO); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .append(); + } + + // sort based on partition columns + env.execute(getClass().getSimpleName()); + + table.refresh(); + // ordered in reverse timeline from the newest snapshot to the oldest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Sometimes we will have more checkpoints than the bounded source if we pass the + // auto checkpoint interval. Thus producing multiple snapshots. 
+ assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); + } + + @TestTemplate + public void testRangeDistributionWithNullValue() throws Exception { + assumeThat(partitioned).isTrue(); + + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + + int numOfCheckpoints = 6; + List> charRows = createCharRows(numOfCheckpoints, 10); + charRows.add(ImmutableList.of(Row.of(1, null))); + DataStream dataStream = + env.addSource(createRangeDistributionBoundedSource(charRows), ROW_TYPE_INFO); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .append(); + } + + // sort based on partition columns + env.execute(getClass().getSimpleName()); + + table.refresh(); + // ordered in reverse timeline from the newest snapshot to the oldest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Sometimes we will have more checkpoints than the bounded source if we pass the + // auto checkpoint interval. Thus producing multiple snapshots. + assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); + } + + @TestTemplate + public void testRangeDistributionWithSortOrder() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + table.replaceSortOrder().asc("data").commit(); + + int numOfCheckpoints = 6; + DataStream dataStream = + env.addSource( + createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), + ROW_TYPE_INFO); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Map) + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Map) + .append(); + } + + env.execute(getClass().getSimpleName()); + + table.refresh(); + // ordered in reverse timeline from the newest snapshot to the oldest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Sometimes we will have more checkpoints than the bounded source if we pass the + // auto checkpoint interval. Thus producing multiple snapshots. + assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); + + // It takes 2 checkpoint cycle for statistics collection and application + // of the globally aggregated statistics in the range partitioner. 
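+    // (earlier checkpoints may be written before the aggregated statistics reach the range partitioner)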
+ // The last two checkpoints should have range shuffle applied + List rangePartitionedCycles = + snapshots.subList(snapshots.size() - 2, snapshots.size()); + + if (partitioned) { + for (Snapshot snapshot : rangePartitionedCycles) { + List addedDataFiles = + Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); + // up to 26 partitions + assertThat(addedDataFiles).hasSizeLessThanOrEqualTo(26); + } + } else { + for (Snapshot snapshot : rangePartitionedCycles) { + List addedDataFiles = + Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); + // each writer task should only write one file for non-partition sort column + assertThat(addedDataFiles).hasSize(writeParallelism); + // verify there is no overlap in min-max stats range + if (parallelism > 1) { + assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); + } + } + } + } + + @TestTemplate + public void testRangeDistributionSketchWithSortOrder() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + table.replaceSortOrder().asc("id").commit(); + + int numOfCheckpoints = 6; + DataStream dataStream = + env.addSource( + createRangeDistributionBoundedSource(createIntRows(numOfCheckpoints, 1_000)), + ROW_TYPE_INFO); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Sketch) + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Sketch) + .append(); + } + + env.execute(getClass().getSimpleName()); + + table.refresh(); + // ordered in reverse timeline from the newest snapshot to the oldest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Sometimes we will have more checkpoints than the bounded source if we pass the + // auto checkpoint interval. Thus producing multiple snapshots. + assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); + + // It takes 2 checkpoint cycle for statistics collection and application + // of the globally aggregated statistics in the range partitioner. + // The last two checkpoints should have range shuffle applied + List rangePartitionedCycles = + snapshots.subList(snapshots.size() - 2, snapshots.size()); + + // since the input has a single value for the data column, + // it is always the same partition. 
Hence there is no difference + // for partitioned or not + for (Snapshot snapshot : rangePartitionedCycles) { + List addedDataFiles = + Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); + // each writer task should only write one file for non-partition sort column + assertThat(addedDataFiles).hasSize(writeParallelism); + // verify there is no overlap in min-max stats range + if (writeParallelism > 2) { + assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); + } + } + } + + /** Test migration from Map stats to Sketch stats */ + @TestTemplate + public void testRangeDistributionStatisticsMigration() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) + .commit(); + table.replaceSortOrder().asc("id").commit(); + + int numOfCheckpoints = 6; + List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); + for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { + // checkpointId 2 would emit 11_000 records which is larger than + // the OPERATOR_SKETCH_SWITCH_THRESHOLD of 10_000. + // This should trigger the stats migration. + int maxId = checkpointId < 2 ? 1_000 : 11_000; + List rows = Lists.newArrayListWithCapacity(maxId); + for (int j = 0; j < maxId; ++j) { + // fixed value "a" for the data (possible partition column) + rows.add(Row.of(j, "a")); + } + + rowsPerCheckpoint.add(rows); + } + + DataStream dataStream = + env.addSource(createRangeDistributionBoundedSource(rowsPerCheckpoint), ROW_TYPE_INFO); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Auto) + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(writeParallelism) + .rangeDistributionStatisticsType(StatisticsType.Auto) + .append(); + } + + env.execute(getClass().getSimpleName()); + + table.refresh(); + // ordered in reverse timeline from the newest snapshot to the oldest snapshot + List snapshots = Lists.newArrayList(table.snapshots().iterator()); + // only keep the snapshots with added data files + snapshots = + snapshots.stream() + .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) + .collect(Collectors.toList()); + + // Sometimes we will have more checkpoints than the bounded source if we pass the + // auto checkpoint interval. Thus producing multiple snapshots. + assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); + + // It takes 2 checkpoint cycle for statistics collection and application + // of the globally aggregated statistics in the range partitioner. + // The last two checkpoints should have range shuffle applied + List rangePartitionedCycles = + snapshots.subList(snapshots.size() - 2, snapshots.size()); + + // since the input has a single value for the data column, + // it is always the same partition. 
Hence there is no difference + // for partitioned or not + for (Snapshot snapshot : rangePartitionedCycles) { + List addedDataFiles = + Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); + // each writer task should only write one file for non-partition sort column + // sometimes + assertThat(addedDataFiles).hasSize(writeParallelism); + // verify there is no overlap in min-max stats range + if (writeParallelism > 1) { + assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); + } + } + } + + private BoundedTestSource createRangeDistributionBoundedSource( + List> rowsPerCheckpoint) { + return new BoundedTestSource<>(rowsPerCheckpoint); + } + + private List> createCharRows(int numOfCheckpoints, int countPerChar) { + List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); + for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { + List rows = Lists.newArrayListWithCapacity(26 * countPerChar); + for (int j = 0; j < countPerChar; ++j) { + for (char c = 'a'; c <= 'z'; ++c) { + rows.add(Row.of(1, String.valueOf(c))); + } + } + + rowsPerCheckpoint.add(rows); + } + + return rowsPerCheckpoint; + } + + private List> createIntRows(int numOfCheckpoints, int maxId) { + List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); + for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { + List rows = Lists.newArrayListWithCapacity(maxId); + for (int j = 0; j < maxId; ++j) { + // fixed value "a" for the data (possible partition column) + rows.add(Row.of(j, "a")); + } + + rowsPerCheckpoint.add(rows); + } + + return rowsPerCheckpoint; + } + + private void assertIdColumnStatsNoRangeOverlap(DataFile file1, DataFile file2) { + // id column has fieldId 1 + int file1LowerBound = + Conversions.fromByteBuffer(Types.IntegerType.get(), file1.lowerBounds().get(1)); + int file1UpperBound = + Conversions.fromByteBuffer(Types.IntegerType.get(), file1.upperBounds().get(1)); + int file2LowerBound = + Conversions.fromByteBuffer(Types.IntegerType.get(), file2.lowerBounds().get(1)); + int file2UpperBound = + Conversions.fromByteBuffer(Types.IntegerType.get(), file2.upperBounds().get(1)); + + if (file1LowerBound < file2LowerBound) { + assertThat(file1UpperBound).isLessThanOrEqualTo(file2LowerBound); + } else { + assertThat(file2UpperBound).isLessThanOrEqualTo(file1LowerBound); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java new file mode 100644 index 000000000000..c21c3d5cc21b --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.table.data.RowData; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.Pair; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestFlinkManifest { + private static final Configuration CONF = new Configuration(); + + @TempDir protected Path temporaryFolder; + + private Table table; + private FileAppenderFactory appenderFactory; + private final AtomicInteger fileCount = new AtomicInteger(0); + + @BeforeEach + public void before() throws IOException { + File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); + String warehouse = folder.getAbsolutePath(); + + String tablePath = warehouse.concat("/test"); + assertThat(new File(tablePath).mkdir()).isTrue(); + + // Construct the iceberg table. 
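+    // empty table properties; the boolean flag requests an unpartitioned table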
+ table = SimpleDataUtil.createTable(tablePath, ImmutableMap.of(), false); + + int[] equalityFieldIds = + new int[] { + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() + }; + this.appenderFactory = + new FlinkAppenderFactory( + table, + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + equalityFieldIds, + table.schema(), + null); + } + + @Test + public void testIO() throws IOException { + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + for (long checkpointId = 1; checkpointId <= 3; checkpointId++) { + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, 1, 1); + final long curCkpId = checkpointId; + + List dataFiles = generateDataFiles(10); + List eqDeleteFiles = generateEqDeleteFiles(5); + List posDeleteFiles = generatePosDeleteFiles(5); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder() + .addDataFiles(dataFiles) + .addDeleteFiles(eqDeleteFiles) + .addDeleteFiles(posDeleteFiles) + .build(), + () -> factory.create(curCkpId), + table.spec()); + + WriteResult result = + FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs()); + assertThat(result.deleteFiles()).hasSize(10); + for (int i = 0; i < dataFiles.size(); i++) { + TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); + } + assertThat(result.deleteFiles()).hasSize(10); + for (int i = 0; i < 5; i++) { + TestHelpers.assertEquals(eqDeleteFiles.get(i), result.deleteFiles()[i]); + } + for (int i = 0; i < 5; i++) { + TestHelpers.assertEquals(posDeleteFiles.get(i), result.deleteFiles()[5 + i]); + } + } + } + + @Test + public void testUserProvidedManifestLocation() throws IOException { + long checkpointId = 1; + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + File userProvidedFolder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); + Map props = + ImmutableMap.of( + ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION, + userProvidedFolder.getAbsolutePath() + "///"); + ManifestOutputFileFactory factory = + new ManifestOutputFileFactory(() -> table, props, flinkJobId, operatorId, 1, 1); + + List dataFiles = generateDataFiles(5); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder().addDataFiles(dataFiles).build(), + () -> factory.create(checkpointId), + table.spec()); + + assertThat(deltaManifests.dataManifest()).isNotNull(); + assertThat(deltaManifests.deleteManifest()).isNull(); + assertThat(Paths.get(deltaManifests.dataManifest().path())) + .hasParent(userProvidedFolder.toPath()); + + WriteResult result = + FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs()); + + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(5); + + assertThat(result.dataFiles()).hasSameSizeAs(dataFiles); + for (int i = 0; i < dataFiles.size(); i++) { + TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); + } + } + + @Test + public void testVersionedSerializer() throws IOException { + long checkpointId = 1; + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, 1, 1); + + List dataFiles = generateDataFiles(10); + List eqDeleteFiles = 
generateEqDeleteFiles(10); + List posDeleteFiles = generatePosDeleteFiles(10); + DeltaManifests expected = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder() + .addDataFiles(dataFiles) + .addDeleteFiles(eqDeleteFiles) + .addDeleteFiles(posDeleteFiles) + .build(), + () -> factory.create(checkpointId), + table.spec()); + + byte[] versionedSerializeData = + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, expected); + DeltaManifests actual = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, versionedSerializeData); + TestHelpers.assertEquals(expected.dataManifest(), actual.dataManifest()); + TestHelpers.assertEquals(expected.deleteManifest(), actual.deleteManifest()); + + byte[] versionedSerializeData2 = + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, actual); + assertThat(versionedSerializeData2).containsExactly(versionedSerializeData); + } + + @Test + public void testCompatibility() throws IOException { + // The v2 deserializer should be able to deserialize the v1 binary. + long checkpointId = 1; + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, 1, 1); + + List dataFiles = generateDataFiles(10); + ManifestFile manifest = + FlinkManifestUtil.writeDataFiles(factory.create(checkpointId), table.spec(), dataFiles); + byte[] dataV1 = + SimpleVersionedSerialization.writeVersionAndSerialize(new V1Serializer(), manifest); + + DeltaManifests delta = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, dataV1); + assertThat(delta.deleteManifest()).isNull(); + assertThat(delta.dataManifest()).isNotNull(); + TestHelpers.assertEquals(manifest, delta.dataManifest()); + + List actualFiles = + FlinkManifestUtil.readDataFiles(delta.dataManifest(), table.io(), table.specs()); + assertThat(actualFiles).hasSize(10); + for (int i = 0; i < 10; i++) { + TestHelpers.assertEquals(dataFiles.get(i), actualFiles.get(i)); + } + } + + private static class V1Serializer implements SimpleVersionedSerializer { + + @Override + public int getVersion() { + return 1; + } + + @Override + public byte[] serialize(ManifestFile m) throws IOException { + return ManifestFiles.encode(m); + } + + @Override + public ManifestFile deserialize(int version, byte[] serialized) throws IOException { + return ManifestFiles.decode(serialized); + } + } + + private DataFile writeDataFile(String filename, List rows) throws IOException { + return SimpleDataUtil.writeFile( + table, + table.schema(), + table.spec(), + CONF, + table.location(), + FileFormat.PARQUET.addExtension(filename), + rows); + } + + private DeleteFile writeEqDeleteFile(String filename, List deletes) throws IOException { + return SimpleDataUtil.writeEqDeleteFile( + table, FileFormat.PARQUET, filename, appenderFactory, deletes); + } + + private DeleteFile writePosDeleteFile(String filename, List> positions) + throws IOException { + return SimpleDataUtil.writePosDeleteFile( + table, FileFormat.PARQUET, filename, appenderFactory, positions); + } + + private List generateDataFiles(int fileNum) throws IOException { + List rowDataList = Lists.newArrayList(); + List dataFiles = Lists.newArrayList(); + for (int i = 0; i < fileNum; i++) { + rowDataList.add(SimpleDataUtil.createRowData(i, "a" + i)); + 
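+      // rowDataList accumulates across iterations, so each generated data file contains all rows written so far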
dataFiles.add(writeDataFile("data-file-" + fileCount.incrementAndGet(), rowDataList)); + } + return dataFiles; + } + + private List generateEqDeleteFiles(int fileNum) throws IOException { + List rowDataList = Lists.newArrayList(); + List deleteFiles = Lists.newArrayList(); + for (int i = 0; i < fileNum; i++) { + rowDataList.add(SimpleDataUtil.createDelete(i, "a" + i)); + deleteFiles.add( + writeEqDeleteFile("eq-delete-file-" + fileCount.incrementAndGet(), rowDataList)); + } + return deleteFiles; + } + + private List generatePosDeleteFiles(int fileNum) throws IOException { + List> positions = Lists.newArrayList(); + List deleteFiles = Lists.newArrayList(); + for (int i = 0; i < fileNum; i++) { + positions.add(Pair.of("data-file-1", (long) i)); + deleteFiles.add( + writePosDeleteFile("pos-delete-file-" + fileCount.incrementAndGet(), positions)); + } + return deleteFiles; + } + + private static String newFlinkJobId() { + return UUID.randomUUID().toString(); + } + + private static String newOperatorUniqueId() { + return UUID.randomUUID().toString(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java new file mode 100644 index 000000000000..939ed2be7dbc --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.Arrays; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.FileWriterFactory; +import org.apache.iceberg.io.TestPartitioningWriters; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.StructLikeSet; + +public class TestFlinkPartitioningWriters extends TestPartitioningWriters { + + @Parameters(name = "formatVersion = {0}, fileFormat = {1}") + protected static List parameters() { + return Arrays.asList( + new Object[] {2, FileFormat.AVRO}, + new Object[] {2, FileFormat.PARQUET}, + new Object[] {2, FileFormat.ORC}); + } + + @Override + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { + return FlinkFileWriterFactory.builderFor(table) + .dataSchema(table.schema()) + .dataFileFormat(format()) + .deleteFileFormat(format()) + .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) + .equalityDeleteRowSchema(equalityDeleteRowSchema) + .positionDeleteRowSchema(positionDeleteRowSchema) + .build(); + } + + @Override + protected RowData toRow(Integer id, String data) { + return SimpleDataUtil.createRowData(id, data); + } + + @Override + protected StructLikeSet toSet(Iterable rows) { + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + RowType flinkType = FlinkSchemaUtil.convert(table.schema()); + for (RowData row : rows) { + RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct()); + set.add(wrapper.wrap(row)); + } + return set; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java new file mode 100644 index 000000000000..3050752d1c24 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.FileWriterFactory; +import org.apache.iceberg.io.TestPositionDeltaWriters; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.StructLikeSet; + +public class TestFlinkPositionDeltaWriters extends TestPositionDeltaWriters { + + @Override + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { + return FlinkFileWriterFactory.builderFor(table) + .dataSchema(table.schema()) + .dataFileFormat(format()) + .deleteFileFormat(format()) + .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) + .equalityDeleteRowSchema(equalityDeleteRowSchema) + .positionDeleteRowSchema(positionDeleteRowSchema) + .build(); + } + + @Override + protected RowData toRow(Integer id, String data) { + return SimpleDataUtil.createRowData(id, data); + } + + @Override + protected StructLikeSet toSet(Iterable rows) { + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + RowType flinkType = FlinkSchemaUtil.convert(table.schema()); + for (RowData row : rows) { + RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct()); + set.add(wrapper.wrap(row)); + } + return set; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java new file mode 100644 index 000000000000..03051b69cf87 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.flink.sink;
+
+import java.util.List;
+import org.apache.flink.table.data.RowData;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.flink.SimpleDataUtil;
+import org.apache.iceberg.io.FileWriterFactory;
+import org.apache.iceberg.io.TestRollingFileWriters;
+import org.apache.iceberg.util.ArrayUtil;
+
+public class TestFlinkRollingFileWriters extends TestRollingFileWriters<RowData> {
+
+  @Override
+  protected FileWriterFactory<RowData> newWriterFactory(
+      Schema dataSchema,
+      List<Integer> equalityFieldIds,
+      Schema equalityDeleteRowSchema,
+      Schema positionDeleteRowSchema) {
+    return FlinkFileWriterFactory.builderFor(table)
+        .dataSchema(table.schema())
+        .dataFileFormat(format())
+        .deleteFileFormat(format())
+        .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds))
+        .equalityDeleteRowSchema(equalityDeleteRowSchema)
+        .positionDeleteRowSchema(positionDeleteRowSchema)
+        .build();
+  }
+
+  @Override
+  protected RowData toRow(Integer id, String data) {
+    return SimpleDataUtil.createRowData(id, data);
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java
new file mode 100644
index 000000000000..e6d64ef2c720
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.iceberg.flink.sink; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.FileWriterFactory; +import org.apache.iceberg.io.TestWriterMetrics; + +public class TestFlinkWriterMetrics extends TestWriterMetrics { + + public TestFlinkWriterMetrics(FileFormat fileFormat) { + super(fileFormat); + } + + @Override + protected FileWriterFactory newWriterFactory(Table sourceTable) { + return FlinkFileWriterFactory.builderFor(sourceTable) + .dataSchema(sourceTable.schema()) + .dataFileFormat(fileFormat) + .deleteFileFormat(fileFormat) + .positionDeleteRowSchema(sourceTable.schema()) + .build(); + } + + @Override + protected RowData toRow(Integer id, String data, boolean boolValue, Long longValue) { + GenericRowData nested = GenericRowData.of(boolValue, longValue); + GenericRowData row = GenericRowData.of(id, StringData.fromString(data), nested); + return row; + } + + @Override + public RowData toGenericRow(int value, int repeated) { + GenericRowData row = new GenericRowData(repeated); + for (int i = 0; i < repeated; i++) { + row.setField(i, value); + } + return row; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java new file mode 100644 index 000000000000..76338a185a62 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java @@ -0,0 +1,1435 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; +import static org.apache.iceberg.flink.sink.SinkTestUtil.extractAndAssertCommittableSummary; +import static org.apache.iceberg.flink.sink.SinkTestUtil.extractAndAssertCommittableWithLineage; +import static org.apache.iceberg.flink.sink.SinkTestUtil.transformsToStreamElement; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assumptions.assumeThat; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.TaskInfo; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.api.connector.sink2.Committer; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.runtime.execution.Environment; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessageSerializer; +import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; +import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; +import org.apache.flink.streaming.api.connector.sink2.SinkV2Assertions; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.flink.streaming.runtime.operators.sink.CommitterOperatorFactory; +import org.apache.flink.streaming.runtime.streamrecord.StreamElement; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.StreamTask; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.data.RowData; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestWriter; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestBase; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@ExtendWith(ParameterizedTestExtension.class) +class TestIcebergCommitter extends TestBase { + private static final Logger LOG = LoggerFactory.getLogger(TestIcebergCommitter.class); + public static final String OPERATOR_ID = "flink-sink"; + @TempDir File temporaryFolder; + + @TempDir File flinkManifestFolder; + + private Table table; + + private TableLoader tableLoader; + + @Parameter(index = 1) + private Boolean isStreamingMode; + + @Parameter(index = 2) + private String branch; + + private final String jobId = "jobId"; + private final long dataFIleRowCount = 5L; + + private final TestCommittableMessageTypeSerializer committableMessageTypeSerializer = + new TestCommittableMessageTypeSerializer(); + + private final DataFile dataFileTest1 = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withMetrics( + new Metrics( + dataFIleRowCount, + null, // no column sizes + ImmutableMap.of(1, 5L), // value count + ImmutableMap.of(1, 0L), // null count + null, + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private final DataFile dataFileTest2 = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withMetrics( + new Metrics( + dataFIleRowCount, + null, // no column sizes + ImmutableMap.of(1, 5L), // value count + ImmutableMap.of(1, 0L), // null count + null, + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + @SuppressWarnings("checkstyle:NestedForDepth") + @Parameters(name = "formatVersion={0} isStreaming={1}, branch={2}") + protected static List parameters() { + List parameters = Lists.newArrayList(); + for (Boolean isStreamingMode : new Boolean[] {true, false}) { + for (int formatVersion : org.apache.iceberg.TestHelpers.ALL_VERSIONS) { + parameters.add(new Object[] {formatVersion, isStreamingMode, SnapshotRef.MAIN_BRANCH}); + parameters.add(new Object[] {formatVersion, isStreamingMode, "test-branch"}); + } + } + return parameters; + } + + @BeforeEach + public void before() throws Exception { + String warehouse = temporaryFolder.getAbsolutePath(); + + String tablePath = warehouse.concat("/test"); + assertThat(new File(tablePath).mkdir()).as("Should create the table path correctly.").isTrue(); + + Map props = + ImmutableMap.of( + TableProperties.FORMAT_VERSION, + String.valueOf(formatVersion), + FLINK_MANIFEST_LOCATION, + flinkManifestFolder.getAbsolutePath(), + IcebergCommitter.MAX_CONTINUOUS_EMPTY_COMMITS, + "1"); + table = SimpleDataUtil.createTable(tablePath, props, false); + tableLoader = TableLoader.fromHadoopTable(tablePath); + } + + @TestTemplate + public void testCommitTxnWithoutDataFiles() throws Exception { + IcebergCommitter committer = getCommitter(); + SimpleDataUtil.assertTableRows(table, Lists.newArrayList(), branch); + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, -1); + + for (long i = 1; i <= 3; i++) { + Committer.CommitRequest commitRequest = + buildCommitRequestFor(jobId, i, 
Lists.newArrayList()); + committer.commit(Lists.newArrayList(commitRequest)); + assertMaxCommittedCheckpointId(jobId, i); + assertSnapshotSize((int) i); + } + } + + @TestTemplate + public void testMxContinuousEmptyCommits() throws Exception { + table.updateProperties().set(IcebergCommitter.MAX_CONTINUOUS_EMPTY_COMMITS, "3").commit(); + IcebergCommitter committer = getCommitter(); + for (int i = 1; i <= 9; i++) { + Committer.CommitRequest commitRequest = + buildCommitRequestFor(jobId, i, Lists.newArrayList()); + committer.commit(Lists.newArrayList(commitRequest)); + assertFlinkManifests(0); + assertSnapshotSize(i / 3); + } + } + + @TestTemplate + public void testCommitTxn() throws Exception { + IcebergCommitter committer = getCommitter(); + assertSnapshotSize(0); + List rows = Lists.newArrayListWithExpectedSize(3); + for (int i = 1; i <= 3; i++) { + RowData rowData = SimpleDataUtil.createRowData(i, "hello" + i); + DataFile dataFile = writeDataFile("data-" + i, ImmutableList.of(rowData)); + rows.add(rowData); + WriteResult writeResult = of(dataFile); + Committer.CommitRequest commitRequest = + buildCommitRequestFor(jobId, i, Lists.newArrayList(writeResult)); + committer.commit(Lists.newArrayList(commitRequest)); + assertFlinkManifests(0); + SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows), branch); + assertSnapshotSize(i); + assertMaxCommittedCheckpointId(jobId, i); + Map summary = SimpleDataUtil.latestSnapshot(table, branch).summary(); + assertThat(summary) + .containsEntry("flink.test", "org.apache.iceberg.flink.sink.TestIcebergCommitter") + .containsEntry("added-data-files", "1") + .containsEntry("flink.operator-id", OPERATOR_ID) + .containsEntry("flink.job-id", "jobId"); + } + } + + @TestTemplate + public void testOrderedEventsBetweenCheckpoints() throws Exception { + // It's possible that two checkpoints happen in the following orders: + // 1. snapshotState for checkpoint#1; + // 2. snapshotState for checkpoint#2; + // 3. notifyCheckpointComplete for checkpoint#1; + // 4. notifyCheckpointComplete for checkpoint#2; + + long timestamp = 0; + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness = getTestHarness()) { + + harness.open(); + + assertMaxCommittedCheckpointId(jobId, -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello"); + DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); + + processElement(jobId, 1, harness, 1, OPERATOR_ID, dataFile1); + assertMaxCommittedCheckpointId(jobId, -1L); + + // 1. snapshotState for checkpoint#1 + long firstCheckpointId = 1; + harness.snapshot(firstCheckpointId, ++timestamp); + assertFlinkManifests(1); + + RowData row2 = SimpleDataUtil.createRowData(2, "world"); + DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); + processElement(jobId, 2, harness, 1, OPERATOR_ID, dataFile2); + assertMaxCommittedCheckpointId(jobId, -1L); + + // 2. snapshotState for checkpoint#2 + long secondCheckpointId = 2; + OperatorSubtaskState snapshot = harness.snapshot(secondCheckpointId, ++timestamp); + assertFlinkManifests(2); + + // 3. notifyCheckpointComplete for checkpoint#1 + harness.notifyOfCompletedCheckpoint(firstCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + assertMaxCommittedCheckpointId(jobId, firstCheckpointId); + assertFlinkManifests(1); + + // 4. 
notifyCheckpointComplete for checkpoint#2 + harness.notifyOfCompletedCheckpoint(secondCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); + assertMaxCommittedCheckpointId(jobId, secondCheckpointId); + assertFlinkManifests(0); + } + } + + @TestTemplate + public void testDisorderedEventsBetweenCheckpoints() throws Exception { + // It's possible that two checkpoints happen in the following orders: + // 1. snapshotState for checkpoint#1; + // 2. snapshotState for checkpoint#2; + // 3. notifyCheckpointComplete for checkpoint#2; + // 4. notifyCheckpointComplete for checkpoint#1; + + long timestamp = 0; + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness = getTestHarness()) { + + harness.open(); + assertMaxCommittedCheckpointId(jobId, -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello"); + DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); + + processElement(jobId, 1, harness, 1, OPERATOR_ID, dataFile1); + assertMaxCommittedCheckpointId(jobId, -1L); + + // 1. snapshotState for checkpoint#1 + long firstCheckpointId = 1; + harness.snapshot(firstCheckpointId, ++timestamp); + assertFlinkManifests(1); + + RowData row2 = SimpleDataUtil.createRowData(2, "world"); + DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); + processElement(jobId, 2, harness, 1, OPERATOR_ID, dataFile2); + assertMaxCommittedCheckpointId(jobId, -1L); + + // 2. snapshotState for checkpoint#2 + long secondCheckpointId = 2; + harness.snapshot(secondCheckpointId, ++timestamp); + assertFlinkManifests(2); + + // 3. notifyCheckpointComplete for checkpoint#2 + harness.notifyOfCompletedCheckpoint(secondCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); + assertMaxCommittedCheckpointId(jobId, secondCheckpointId); + assertFlinkManifests(0); + + // 4. 
notifyCheckpointComplete for checkpoint#1 + harness.notifyOfCompletedCheckpoint(firstCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); + assertMaxCommittedCheckpointId(jobId, secondCheckpointId); + assertFlinkManifests(0); + } + } + + @TestTemplate + public void testSingleCommit() throws Exception { + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + testHarness = getTestHarness()) { + testHarness.open(); + + long checkpointId = 1; + + RowData row1 = SimpleDataUtil.createRowData(1, "hello1"); + DataFile dataFile1 = writeDataFile("data-1-1", ImmutableList.of(row1)); + CommittableSummary committableSummary = + processElement(jobId, checkpointId, testHarness, 1, OPERATOR_ID, dataFile1); + + // Trigger commit + testHarness.notifyOfCompletedCheckpoint(checkpointId); + + assertSnapshotSize(1); + assertMaxCommittedCheckpointId(jobId, 1L); + + List output = transformsToStreamElement(testHarness.getOutput()); + + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) + .hasFailedCommittables(committableSummary.getNumberOfFailedCommittables()) + .hasOverallCommittables(committableSummary.getNumberOfCommittables()); + + SinkV2Assertions.assertThat(extractAndAssertCommittableWithLineage(output.get(1))) + .hasSubtaskId(0) + .hasCheckpointId(checkpointId); + } + + table.refresh(); + Snapshot currentSnapshot = table.snapshot(branch); + + assertThat(currentSnapshot.summary()) + .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, "1") + .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "1"); + } + + /** The data was not committed in the previous job. */ + @TestTemplate + public void testStateRestoreFromPreJobWithUncommitted() throws Exception { + String jobId1 = "jobId1"; + OperatorSubtaskState snapshot; + + // We cannot test a different checkpoint thant 0 because when using the OperatorTestHarness + // for recovery the lastCompleted checkpoint is always reset to 0. 
+ // see: https://github.com/apache/iceberg/issues/10942 + long checkpointId = 0; + long timestamp = 0; + CommittableSummary committableSummary; + + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + preJobTestHarness = getTestHarness()) { + + preJobTestHarness.open(); + + committableSummary = + processElement(jobId1, checkpointId, preJobTestHarness, 1, OPERATOR_ID, dataFileTest1); + + snapshot = preJobTestHarness.snapshot(checkpointId, ++timestamp); + + assertThat(preJobTestHarness.getOutput()).isEmpty(); + } + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId1, -1L); + + String jobId2 = "jobId2"; + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + restored = getTestHarness()) { + restored.setup(committableMessageTypeSerializer); + restored.initializeState(snapshot); + restored.open(); + + // Previous committables are immediately committed if possible + List output = transformsToStreamElement(restored.getOutput()); + assertThat(output).hasSize(2); + + SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) + .hasFailedCommittables(committableSummary.getNumberOfFailedCommittables()) + .hasOverallCommittables(committableSummary.getNumberOfCommittables()); + + SinkV2Assertions.assertThat(extractAndAssertCommittableWithLineage(output.get(1))) + .hasCheckpointId(0L) + .hasSubtaskId(0); + + table.refresh(); + + Snapshot currentSnapshot = table.snapshot(branch); + + assertThat(currentSnapshot.summary()) + .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount)) + .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "1") + .containsEntry("flink.job-id", jobId1); + + checkpointId++; + CommittableSummary committableSummary2 = + processElement(jobId2, checkpointId, restored, 1, OPERATOR_ID, dataFileTest2); + + // Trigger commit + restored.notifyOfCompletedCheckpoint(checkpointId); + + List output2 = transformsToStreamElement(restored.getOutput()); + SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output2.get(0))) + .hasFailedCommittables(committableSummary2.getNumberOfFailedCommittables()) + .hasOverallCommittables(committableSummary2.getNumberOfCommittables()); + + SinkV2Assertions.assertThat(extractAndAssertCommittableWithLineage(output2.get(1))) + .hasCheckpointId(0L) + .hasSubtaskId(0); + } + + assertSnapshotSize(2); + assertMaxCommittedCheckpointId(jobId2, 1); + + table.refresh(); + Snapshot currentSnapshot2 = table.snapshot(branch); + + assertThat(currentSnapshot2.summary()) + .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount * 2)) + .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "2") + .containsEntry("flink.job-id", jobId2); + } + + /** The data was committed in the previous job. */ + @TestTemplate + public void testStateRestoreFromPreJobWithCommitted() throws Exception { + String jobId1 = "jobId1"; + OperatorSubtaskState snapshot; + + // We cannot test a different checkpoint thant 0 because when using the OperatorTestHarness + // for recovery the lastCompleted checkpoint is always reset to 0. 
+ // see: https://github.com/apache/iceberg/issues/10942 + long checkpointId = 0; + + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + preJobTestHarness = getTestHarness()) { + + preJobTestHarness.open(); + + CommittableSummary committableSummary = + processElement(jobId1, checkpointId, preJobTestHarness, 1, OPERATOR_ID, dataFileTest1); + + assertFlinkManifests(1); + snapshot = preJobTestHarness.snapshot(checkpointId, 2L); + // commit snapshot + preJobTestHarness.notifyOfCompletedCheckpoint(checkpointId); + + List output = transformsToStreamElement(preJobTestHarness.getOutput()); + assertThat(output).hasSize(2); + + SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) + .hasFailedCommittables(committableSummary.getNumberOfFailedCommittables()) + .hasOverallCommittables(committableSummary.getNumberOfCommittables()); + + assertSnapshotSize(1); + assertMaxCommittedCheckpointId(jobId1, checkpointId); + } + + table.refresh(); + long preJobSnapshotId = table.snapshot(branch).snapshotId(); + + String jobId2 = "jobId2"; + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + restored = getTestHarness()) { + restored.setup(); + restored.initializeState(snapshot); + restored.open(); + + // Makes sure that data committed in the previous job is available in this job + List output2 = transformsToStreamElement(restored.getOutput()); + assertThat(output2).hasSize(2); + + table.refresh(); + long restoredSnapshotId = table.snapshot(branch).snapshotId(); + + assertThat(restoredSnapshotId) + .as("The table does not generate a new snapshot without data being committed.") + .isEqualTo(preJobSnapshotId); + + assertThat(table.snapshot(branch).summary()) + .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount)) + .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "1") + .containsEntry("flink.job-id", jobId1); + + // Commit new data file + checkpointId = 1; + CommittableSummary committableSummary2 = + processElement(jobId2, checkpointId, restored, 1, OPERATOR_ID, dataFileTest2); + + // Trigger commit + restored.notifyOfCompletedCheckpoint(checkpointId); + + List output3 = transformsToStreamElement(restored.getOutput()); + assertThat(output3).hasSize(4); + SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output3.get(0))) + .hasFailedCommittables(committableSummary2.getNumberOfFailedCommittables()) + .hasOverallCommittables(committableSummary2.getNumberOfCommittables()); + } + + assertSnapshotSize(2); + assertMaxCommittedCheckpointId(jobId2, 1L); + + table.refresh(); + Snapshot currentSnapshot2 = table.snapshot(branch); + assertThat(Long.parseLong(currentSnapshot2.summary().get(SnapshotSummary.TOTAL_RECORDS_PROP))) + .isEqualTo(dataFIleRowCount * 2); + + assertThat(currentSnapshot2.summary()) + .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount * 2)) + .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "2") + .containsEntry("flink.job-id", jobId2); + } + + @TestTemplate + public void testStateRestoreFromCurrJob() throws Exception { + String jobId1 = "jobId1"; + CommittableSummary committableSummary; + OperatorSubtaskState snapshot; + + // We cannot test a different checkpoint thant 0 because when using the OperatorTestHarness + // for recovery the lastCompleted checkpoint is always reset to 0. 
+ // see: https://github.com/apache/iceberg/issues/10942 + long checkpointId = 0; + + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + testHarness = getTestHarness()) { + + testHarness.open(); + + committableSummary = + processElement(jobId1, checkpointId, testHarness, 1, OPERATOR_ID, dataFileTest1); + snapshot = testHarness.snapshot(checkpointId, 2L); + + assertThat(testHarness.getOutput()).isEmpty(); + } + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId1, -1L); + + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + restored = getTestHarness()) { + + restored.setup(committableMessageTypeSerializer); + + restored.initializeState(snapshot); + restored.open(); + + // Previous committables are immediately committed if possible + List output = transformsToStreamElement(restored.getOutput()); + assertThat(output).hasSize(2); + + SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) + .hasFailedCommittables(committableSummary.getNumberOfFailedCommittables()) + .hasOverallCommittables(committableSummary.getNumberOfCommittables()); + + table.refresh(); + Snapshot currentSnapshot = table.snapshot(branch); + + assertThat(currentSnapshot.summary()) + .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount)) + .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "1") + .containsEntry("flink.job-id", jobId1); + + String jobId2 = "jobId2"; + checkpointId = 1; + CommittableSummary committableSummary2 = + processElement(jobId2, checkpointId, restored, 1, OPERATOR_ID, dataFileTest2); + + // Trigger commit + restored.notifyOfCompletedCheckpoint(checkpointId); + + List output2 = transformsToStreamElement(restored.getOutput()); + SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output2.get(0))) + .hasFailedCommittables(committableSummary2.getNumberOfFailedCommittables()) + .hasOverallCommittables(committableSummary2.getNumberOfCommittables()); + restored.close(); + + assertSnapshotSize(2); + assertMaxCommittedCheckpointId(jobId2, 1L); + + table.refresh(); + Snapshot currentSnapshot2 = table.snapshot(branch); + assertThat(currentSnapshot2.summary()) + .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount * 2)) + .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "2") + .containsEntry("flink.job-id", jobId2); + } + } + + @TestTemplate + public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception { + // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). + // The Flink job should be able to restore from a checkpoint with only step#1 finished. + + // We cannot test a different checkpoint thant 0 because when using the OperatorTestHarness + // for recovery the lastCompleted checkpoint is always reset to 0. 
+ // see: https://github.com/apache/iceberg/issues/10942 + long checkpointId = 0; + long timestamp = 0; + OperatorSubtaskState snapshot; + List expectedRows = Lists.newArrayList(); + + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness = getTestHarness()) { + + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, -1L); + + RowData row = SimpleDataUtil.createRowData(1, "hello"); + expectedRows.add(row); + DataFile dataFile = writeDataFile("data-1", ImmutableList.of(row)); + processElement(jobId, checkpointId, harness, 1, operatorId.toString(), dataFile); + + snapshot = harness.snapshot(++checkpointId, ++timestamp); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(), branch); + assertMaxCommittedCheckpointId(jobId, -1L); + assertFlinkManifests(1); + } + + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness = getTestHarness()) { + + harness.getStreamConfig().setOperatorID(operatorId); + harness.initializeState(snapshot); + harness.open(); + + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertMaxCommittedCheckpointId(jobId, operatorId.toString(), 0L); + + harness.snapshot(++checkpointId, ++timestamp); + // Did not write any new record, so it won't generate new manifest. + assertFlinkManifests(0); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(1); + + assertMaxCommittedCheckpointId(jobId, operatorId.toString(), 0); + + RowData row = SimpleDataUtil.createRowData(2, "world"); + expectedRows.add(row); + DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); + processElement(jobId, checkpointId, harness, 1, operatorId.toString(), dataFile); + + snapshot = harness.snapshot(++checkpointId, ++timestamp); + assertFlinkManifests(1); + harness.notifyOfCompletedCheckpoint(checkpointId); + } + + // Redeploying flink job from external checkpoint. + JobID newJobId = new JobID(); + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness = getTestHarness()) { + harness.getStreamConfig().setOperatorID(operatorId); + harness.initializeState(snapshot); + harness.open(); + + // test harness has a limitation wherein it is not able to commit pending commits when + // initializeState is called, when the checkpointId > 0 + // so we have to call it explicitly + harness.notifyOfCompletedCheckpoint(checkpointId); + + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. 
+ assertFlinkManifests(0); + + assertMaxCommittedCheckpointId(newJobId.toString(), operatorId.toString(), -1); + assertMaxCommittedCheckpointId(jobId, operatorId.toString(), 2); + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(2); + + RowData row = SimpleDataUtil.createRowData(3, "foo"); + expectedRows.add(row); + DataFile dataFile = writeDataFile("data-3", ImmutableList.of(row)); + processElement( + newJobId.toString(), checkpointId, harness, 1, operatorId.toString(), dataFile); + + harness.snapshot(++checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(3); + assertMaxCommittedCheckpointId(newJobId.toString(), operatorId.toString(), 3); + } + } + + @TestTemplate + public void testStartAnotherJobToWriteSameTable() throws Exception { + long checkpointId = 1; + long timestamp = 0; + + List rows = Lists.newArrayList(); + List tableRows = Lists.newArrayList(); + + JobID oldJobId = new JobID(); + OperatorID oldOperatorId; + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness = getTestHarness()) { + + harness.open(); + oldOperatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(oldJobId.toString(), oldOperatorId.toString(), -1L); + + for (int i = 1; i <= 3; i++) { + rows.add(SimpleDataUtil.createRowData(i, "hello" + i)); + tableRows.addAll(rows); + + DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); + processElement( + oldJobId.toString(), ++checkpointId, harness, 1, oldOperatorId.toString(), dataFile); + harness.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, tableRows, branch); + assertSnapshotSize(i); + assertMaxCommittedCheckpointId(oldJobId.toString(), oldOperatorId.toString(), checkpointId); + } + } + + // The new started job will start with checkpoint = 1 again. 
+ checkpointId = 1; + JobID newJobId = new JobID(); + OperatorID newOperatorId; + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness = getTestHarness()) { + harness.open(); + newOperatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(3); + assertMaxCommittedCheckpointId(oldJobId.toString(), oldOperatorId.toString(), 4); + assertMaxCommittedCheckpointId(newJobId.toString(), newOperatorId.toString(), -1); + + rows.add(SimpleDataUtil.createRowData(2, "world")); + tableRows.addAll(rows); + + DataFile dataFile = writeDataFile("data-new-1", rows); + processElement( + newJobId.toString(), checkpointId, harness, 1, newOperatorId.toString(), dataFile); + harness.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + SimpleDataUtil.assertTableRows(table, tableRows, branch); + assertSnapshotSize(4); + assertMaxCommittedCheckpointId(newJobId.toString(), newOperatorId.toString(), checkpointId); + } + } + + @TestTemplate + public void testMultipleJobsWriteSameTable() throws Exception { + long timestamp = 0; + List tableRows = Lists.newArrayList(); + + JobID[] jobs = new JobID[] {new JobID(), new JobID(), new JobID()}; + OperatorID[] operatorIds = + new OperatorID[] {new OperatorID(), new OperatorID(), new OperatorID()}; + for (int i = 0; i < 20; i++) { + int jobIndex = i % 3; + int checkpointId = i / 3; + JobID jobID = jobs[jobIndex]; + OperatorID operatorId = operatorIds[jobIndex]; + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness = getTestHarness()) { + harness.getStreamConfig().setOperatorID(operatorId); + + harness.open(); + + assertSnapshotSize(i); + assertMaxCommittedCheckpointId( + jobID.toString(), operatorId.toString(), checkpointId == 0 ? -1 : checkpointId - 1); + + List rows = Lists.newArrayList(SimpleDataUtil.createRowData(i, "word-" + i)); + tableRows.addAll(rows); + + DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); + + processElement(jobID.toString(), checkpointId, harness, 1, operatorId.toString(), dataFile); + + harness.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + SimpleDataUtil.assertTableRows(table, tableRows, branch); + assertSnapshotSize(i + 1); + assertMaxCommittedCheckpointId(jobID.toString(), operatorId.toString(), checkpointId); + } + } + } + + @TestTemplate + public void testMultipleSinksRecoveryFromValidSnapshot() throws Exception { + + // We cannot test a different checkpoint thant 0 because when using the OperatorTestHarness + // for recovery the lastCompleted checkpoint is always reset to 0. 
+ // see: https://github.com/apache/iceberg/issues/10942 + long checkpointId = 0; + long timestamp = 0; + List expectedRows = Lists.newArrayList(); + OperatorSubtaskState snapshot1; + OperatorSubtaskState snapshot2; + + JobID jobID = new JobID(); + OperatorID operatorId1 = new OperatorID(); + OperatorID operatorId2 = new OperatorID(); + + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness1 = getTestHarness()) { + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness2 = getTestHarness()) { + harness1.getStreamConfig().setOperatorID(operatorId1); + harness1.setup(); + harness1.open(); + harness2.getStreamConfig().setOperatorID(operatorId2); + harness2.setup(); + harness2.open(); + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobID.toString(), operatorId1.toString(), -1L); + assertMaxCommittedCheckpointId(jobID.toString(), operatorId2.toString(), -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello1"); + expectedRows.add(row1); + DataFile dataFile1 = writeDataFile("data-1-1", ImmutableList.of(row1)); + processElement( + jobID.toString(), checkpointId, harness1, 1, operatorId1.toString(), dataFile1); + + snapshot1 = harness1.snapshot(checkpointId, ++timestamp); + + RowData row2 = SimpleDataUtil.createRowData(1, "hello2"); + expectedRows.add(row2); + DataFile dataFile2 = writeDataFile("data-1-2", ImmutableList.of(row2)); + processElement( + jobID.toString(), checkpointId, harness2, 1, operatorId2.toString(), dataFile2); + + snapshot2 = harness2.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(2); + + // Only notify one of the committers + harness1.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(1); + + // Only the first row is committed at this point + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + assertSnapshotSize(1); + assertMaxCommittedCheckpointId(jobID.toString(), operatorId1.toString(), checkpointId); + assertMaxCommittedCheckpointId(jobID.toString(), operatorId2.toString(), -1); + } + } + + // Restore from the given snapshot + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness1 = getTestHarness(); + OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness2 = getTestHarness()) { + harness1.getStreamConfig().setOperatorID(operatorId1); + harness1.setup(); + harness1.initializeState(snapshot1); + harness1.open(); + + harness2.getStreamConfig().setOperatorID(operatorId2); + harness2.setup(); + harness2.initializeState(snapshot2); + harness2.open(); + + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. 
+ assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(2); + assertMaxCommittedCheckpointId(jobID.toString(), operatorId1.toString(), checkpointId); + assertMaxCommittedCheckpointId(jobID.toString(), operatorId2.toString(), checkpointId); + + RowData row1 = SimpleDataUtil.createRowData(2, "world1"); + expectedRows.add(row1); + DataFile dataFile1 = writeDataFile("data-2-1", ImmutableList.of(row1)); + + checkpointId++; + processElement( + jobID.toString(), checkpointId, harness1, 1, operatorId1.toString(), dataFile1); + + harness1.snapshot(checkpointId, ++timestamp); + + RowData row2 = SimpleDataUtil.createRowData(2, "world2"); + expectedRows.add(row2); + DataFile dataFile2 = writeDataFile("data-2-2", ImmutableList.of(row2)); + processElement( + jobID.toString(), checkpointId, harness2, 1, operatorId2.toString(), dataFile2); + + harness2.snapshot(checkpointId, ++timestamp); + + assertFlinkManifests(2); + + harness1.notifyOfCompletedCheckpoint(checkpointId); + harness2.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(4); + assertMaxCommittedCheckpointId(jobID.toString(), operatorId1.toString(), checkpointId); + assertMaxCommittedCheckpointId(jobID.toString(), operatorId2.toString(), checkpointId); + } + } + + @TestTemplate + public void testFlinkManifests() throws Exception { + long timestamp = 0; + long checkpoint = 1; + + JobID jobID = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + harness = getTestHarness()) { + + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobID.toString(), operatorId.toString(), -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello"); + DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); + + // harness.processElement(of(dataFile1), ++timestamp); + processElement(jobID.toString(), checkpoint, harness, 1, operatorId.toString(), dataFile1); + + assertMaxCommittedCheckpointId(jobID.toString(), operatorId.toString(), -1L); + + // 1. snapshotState for checkpoint#1 + harness.snapshot(checkpoint, ++timestamp); + List manifestPaths = assertFlinkManifests(1); + Path manifestPath = manifestPaths.get(0); + assertThat(manifestPath.getFileName()) + .asString() + .isEqualTo( + String.format("%s-%s-%05d-%d-%d-%05d.avro", jobID, operatorId, 0, 0, checkpoint, 1)); + // + // 2. Read the data files from manifests and assert. + List dataFiles = + FlinkManifestUtil.readDataFiles( + createTestingManifestFile(manifestPath, dataFile1), table.io(), table.specs()); + assertThat(dataFiles).hasSize(1); + TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); + + // 3. 
notifyCheckpointComplete for checkpoint#1 + harness.notifyOfCompletedCheckpoint(checkpoint); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + assertMaxCommittedCheckpointId(jobID.toString(), operatorId.toString(), checkpoint); + assertFlinkManifests(0); + } + } + + @TestTemplate + public void testHandleEndInput() throws Exception { + assumeThat(isStreamingMode).as("Only support batch mode").isFalse(); + + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + testHarness = getTestHarness()) { + + testHarness.open(); + + long checkpointId = Long.MAX_VALUE; + processElement(jobId, checkpointId, testHarness, 1, OPERATOR_ID, dataFileTest1); + + testHarness.endInput(); + + assertMaxCommittedCheckpointId(jobId, OPERATOR_ID, Long.MAX_VALUE); + + List output = transformsToStreamElement(testHarness.getOutput()); + assertThat(output).hasSize(2); + + SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) + .hasCheckpointId(checkpointId) + .hasOverallCommittables(1) + .hasFailedCommittables(0); + + // endInput is idempotent + testHarness.endInput(); + assertThat(testHarness.getOutput()).hasSize(2); + } + } + + @TestTemplate + public void testDeleteFiles() throws Exception { + + assumeThat(formatVersion).as("Only support delete in format v2").isGreaterThanOrEqualTo(2); + + FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); + + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + testHarness = getTestHarness()) { + + testHarness.open(); + + long checkpointId = 1; + RowData row1 = SimpleDataUtil.createInsert(1, "aaa"); + DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(row1)); + processElement(jobId, checkpointId, testHarness, 1, OPERATOR_ID, dataFile1); + + // testHarness.snapshot(checkpointId, 0); + testHarness.notifyOfCompletedCheckpoint(checkpointId); + + assertSnapshotSize(1); + assertMaxCommittedCheckpointId(jobId, checkpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + + List output = transformsToStreamElement(testHarness.getOutput()); + assertThat(output).hasSize(2); + SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) + .hasCheckpointId(checkpointId) + .hasOverallCommittables(1) + .hasFailedCommittables(0); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + + // The 2. commit + checkpointId = 2; + RowData row2 = SimpleDataUtil.createInsert(2, "bbb"); + DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(row2)); + + RowData row3 = SimpleDataUtil.createInsert(3, "ccc"); + DataFile dataFile3 = writeDataFile("data-file-3", ImmutableList.of(row3)); + processElement(jobId, checkpointId, testHarness, 2, OPERATOR_ID, dataFile2, dataFile3); + + // testHarness.snapshot(checkpointId, 1); + testHarness.notifyOfCompletedCheckpoint(checkpointId); + + assertSnapshotSize(2); + assertMaxCommittedCheckpointId(jobId, checkpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2, row3), branch); + + List output2 = transformsToStreamElement(testHarness.getOutput()); + assertThat(output2).hasSize(2 + 2); + SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output2.get(2))) + .hasCheckpointId(checkpointId) + .hasOverallCommittables(1) + .hasFailedCommittables(0); + + // The 3. 
commit + checkpointId = 3; + RowData delete1 = SimpleDataUtil.createDelete(1, "aaa"); + DeleteFile deleteFile1 = + writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); + RowData row4 = SimpleDataUtil.createInsert(4, "ddd"); + DataFile dataFile4 = writeDataFile("data-file-4", ImmutableList.of(row4)); + + RowData row5 = SimpleDataUtil.createInsert(5, "eee"); + DataFile dataFile5 = writeDataFile("data-file-5", ImmutableList.of(row5)); + WriteResult withRecord4 = + WriteResult.builder() + .addDataFiles(dataFile4, dataFile5) + .addDeleteFiles(deleteFile1) + .build(); + processElement(withRecord4, jobId, checkpointId, testHarness, 2, OPERATOR_ID); + + // testHarness.snapshot(checkpointId, 3); + testHarness.notifyOfCompletedCheckpoint(checkpointId); + + assertSnapshotSize(3); + assertMaxCommittedCheckpointId(jobId, checkpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row2, row3, row4, row5), branch); + + List output3 = transformsToStreamElement(testHarness.getOutput()); + assertThat(output3).hasSize(2 + 2 + 2); + SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output3.get(4))) + .hasCheckpointId(checkpointId) + .hasOverallCommittables(1) + .hasFailedCommittables(0); + } + } + + private ManifestFile createTestingManifestFile(Path manifestPath, DataFile dataFile) + throws IOException { + ManifestWriter writer = + ManifestFiles.write( + formatVersion, + PartitionSpec.unpartitioned(), + table.io().newOutputFile(manifestPath.toString()), + 0L); + writer.add(dataFile); + writer.close(); + return writer.toManifestFile(); + } + + private IcebergWriteAggregator buildIcebergWriteAggregator(String myJobId, String operatorId) { + IcebergWriteAggregator icebergWriteAggregator = spy(new IcebergWriteAggregator(tableLoader)); + StreamTask ctx = mock(StreamTask.class); + Environment env = mock(Environment.class); + StreamingRuntimeContext streamingRuntimeContext = mock(StreamingRuntimeContext.class); + TaskInfo taskInfo = mock(TaskInfo.class); + JobID myJobID = mock(JobID.class); + OperatorID operatorID = mock(OperatorID.class); + doReturn(myJobId).when(myJobID).toString(); + doReturn(myJobID).when(env).getJobID(); + doReturn(env).when(ctx).getEnvironment(); + doReturn(ctx).when(icebergWriteAggregator).getContainingTask(); + doReturn(operatorId).when(operatorID).toString(); + doReturn(operatorID).when(icebergWriteAggregator).getOperatorID(); + doReturn(0).when(taskInfo).getAttemptNumber(); + doReturn(taskInfo).when(streamingRuntimeContext).getTaskInfo(); + doReturn(streamingRuntimeContext).when(icebergWriteAggregator).getRuntimeContext(); + + try { + icebergWriteAggregator.open(); + } catch (Exception e) { + throw new RuntimeException(e); + } + return icebergWriteAggregator; + } + + private CommittableSummary processElement( + WriteResult withRecord, + String myJobId, + long checkpointId, + OneInputStreamOperatorTestHarness testHarness, + int subTaskId, + String operatorId) + throws Exception { + + IcebergCommittable commit = + new IcebergCommittable( + buildIcebergWriteAggregator(myJobId, operatorId) + .writeToManifest(Lists.newArrayList(withRecord), checkpointId), + myJobId, + operatorId, + checkpointId); + + CommittableSummary committableSummary = + new CommittableSummary<>(subTaskId, 1, checkpointId, 1, 1, 0); + testHarness.processElement(new StreamRecord<>(committableSummary)); + + CommittableWithLineage committable = + new CommittableWithLineage<>(commit, checkpointId, subTaskId); + testHarness.processElement(new StreamRecord<>(committable)); + + 
return committableSummary; + } + + private CommittableSummary processElement( + String myJobID, + long checkpointId, + OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + testHarness, + int subTaskId, + String operatorId, + DataFile... dataFile) + throws Exception { + WriteResult withRecord = WriteResult.builder().addDataFiles(dataFile).build(); + return processElement(withRecord, myJobID, checkpointId, testHarness, subTaskId, operatorId); + } + + private FileAppenderFactory createDeletableAppenderFactory() { + int[] equalityFieldIds = + new int[] { + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() + }; + return new FlinkAppenderFactory( + table, + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + equalityFieldIds, + table.schema(), + null); + } + + private List assertFlinkManifests(int expectedCount) throws IOException { + List manifests = + Files.list(flinkManifestFolder.toPath()) + .filter(p -> !p.toString().endsWith(".crc")) + .collect(Collectors.toList()); + assertThat(manifests).hasSize(expectedCount); + return manifests; + } + + private DataFile writeDataFile(String filename, List rows) throws IOException { + return SimpleDataUtil.writeFile( + table, + table.schema(), + table.spec(), + new Configuration(), + table.location(), + FileFormat.PARQUET.addExtension(filename), + rows); + } + + private DeleteFile writeEqDeleteFile( + FileAppenderFactory appenderFactory, String filename, List deletes) + throws IOException { + return SimpleDataUtil.writeEqDeleteFile( + table, FileFormat.PARQUET, filename, appenderFactory, deletes); + } + + private OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + getTestHarness() throws Exception { + IcebergSink sink = + IcebergSink.forRowData(null).table(table).toBranch(branch).tableLoader(tableLoader).build(); + + OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + testHarness = + new OneInputStreamOperatorTestHarness<>( + new CommitterOperatorFactory<>(sink, !isStreamingMode, true)); + testHarness.setup(committableMessageTypeSerializer); + return testHarness; + } + + // ------------------------------- Utility Methods -------------------------------- + + private IcebergCommitter getCommitter() { + IcebergFilesCommitterMetrics metric = mock(IcebergFilesCommitterMetrics.class); + return new IcebergCommitter( + tableLoader, + branch, + Collections.singletonMap("flink.test", TestIcebergCommitter.class.getName()), + false, + 10, + "sinkId", + metric, + false); + } + + private Committer.CommitRequest buildCommitRequestFor( + String myJobID, long checkpoint, Collection writeResults) throws IOException { + IcebergCommittable commit = + new IcebergCommittable( + buildIcebergWriteAggregator(myJobID, OPERATOR_ID) + .writeToManifest(writeResults, checkpoint), + myJobID, + OPERATOR_ID, + checkpoint); + + CommittableWithLineage committableWithLineage = + new CommittableWithLineage(commit, checkpoint, 1); + Committer.CommitRequest commitRequest = mock(Committer.CommitRequest.class); + + doReturn(committableWithLineage.getCommittable()).when(commitRequest).getCommittable(); + + return commitRequest; + } + + private WriteResult of(DataFile dataFile) { + return WriteResult.builder().addDataFiles(dataFile).build(); + } + + private void assertMaxCommittedCheckpointId(String myJobID, String operatorId, long expectedId) { + table.refresh(); + long actualId = SinkUtil.getMaxCommittedCheckpointId(table, myJobID, 
operatorId, branch); + assertThat(actualId).isEqualTo(expectedId); + } + + private void assertMaxCommittedCheckpointId(String myJobID, long expectedId) { + assertMaxCommittedCheckpointId(myJobID, OPERATOR_ID, expectedId); + } + + private void assertSnapshotSize(int expectedSnapshotSize) { + table.refresh(); + assertThat(table.snapshots()).hasSize(expectedSnapshotSize); + } + + private static ByteBuffer longToBuffer(long value) { + return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); + } + + private static class TestCommittableMessageTypeSerializer + extends TypeSerializer> { + + CommittableMessageSerializer serializer = + new CommittableMessageSerializer<>(new IcebergCommittableSerializer()); + + @Override + public boolean isImmutableType() { + return false; + } + + @Override + public TypeSerializer> duplicate() { + return null; + } + + @Override + public CommittableMessage createInstance() { + return null; + } + + @Override + public CommittableMessage copy( + CommittableMessage from) { + return from; + } + + @Override + public CommittableMessage copy( + CommittableMessage from, CommittableMessage reuse) { + return from; + } + + @Override + public int getLength() { + return 0; + } + + @Override + public void serialize(CommittableMessage record, DataOutputView target) + throws IOException { + byte[] serialize = serializer.serialize(record); + target.writeInt(serialize.length); + target.write(serialize); + } + + @Override + public CommittableMessage deserialize(DataInputView source) + throws IOException { + int length = source.readInt(); + byte[] bytes = new byte[length]; + source.read(bytes); + return serializer.deserialize(1, bytes); + } + + @Override + public CommittableMessage deserialize( + CommittableMessage reuse, DataInputView source) throws IOException { + return deserialize(source); + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws IOException { + CommittableMessage deserialize = deserialize(source); + serialize(deserialize, target); + } + + @Override + public boolean equals(Object obj) { + return false; + } + + @Override + public int hashCode() { + return 0; + } + + @Override + public TypeSerializerSnapshot> snapshotConfiguration() { + return null; + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java new file mode 100644 index 000000000000..65fb9b8f69b4 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java @@ -0,0 +1,1238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.flink.sink.IcebergFilesCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; +import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.NavigableMap; +import java.util.SortedMap; +import java.util.stream.Collectors; +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.OperatorStateStore; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.testutils.MockEnvironment; +import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder; +import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; +import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.data.RowData; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestWriter; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionData; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.TestBase; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.TestTableLoader; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.Pair; +import org.apache.iceberg.util.ThreadPools; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergFilesCommitter extends TestBase { + private static final Configuration CONF = new Configuration(); + + private File flinkManifestFolder; + + @Parameter(index = 1) + private FileFormat format; + + @Parameter(index = 2) + private String branch; + + @Parameters(name = "formatVersion = {0}, fileFormat = {1}, branch = {2}") + protected static List 
parameters() { + return Arrays.asList( + new Object[] {1, FileFormat.AVRO, "main"}, + new Object[] {2, FileFormat.AVRO, "test-branch"}, + new Object[] {1, FileFormat.PARQUET, "main"}, + new Object[] {2, FileFormat.PARQUET, "test-branch"}, + new Object[] {1, FileFormat.ORC, "main"}, + new Object[] {2, FileFormat.ORC, "test-branch"}); + } + + @Override + @BeforeEach + public void setupTable() throws IOException { + flinkManifestFolder = Files.createTempDirectory(temp, "flink").toFile(); + this.metadataDir = new File(tableDir, "metadata"); + + // Construct the iceberg table. + table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); + + table + .updateProperties() + .set(DEFAULT_FILE_FORMAT, format.name()) + .set(FLINK_MANIFEST_LOCATION, flinkManifestFolder.getAbsolutePath()) + .set(MAX_CONTINUOUS_EMPTY_COMMITS, "1") + .commit(); + } + + @TestTemplate + public void testCommitTxnWithoutDataFiles() throws Exception { + long checkpointId = 0; + long timestamp = 0; + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + SimpleDataUtil.assertTableRows(table, Lists.newArrayList(), branch); + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the + // future flink job failover won't fail. + for (int i = 1; i <= 3; i++) { + harness.snapshot(++checkpointId, ++timestamp); + assertFlinkManifests(0); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + assertSnapshotSize(i); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + } + } + } + + @TestTemplate + public void testMaxContinuousEmptyCommits() throws Exception { + table.updateProperties().set(MAX_CONTINUOUS_EMPTY_COMMITS, "3").commit(); + + JobID jobId = new JobID(); + long checkpointId = 0; + long timestamp = 0; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + + assertSnapshotSize(0); + + for (int i = 1; i <= 9; i++) { + harness.snapshot(++checkpointId, ++timestamp); + harness.notifyOfCompletedCheckpoint(checkpointId); + + assertSnapshotSize(i / 3); + } + } + } + + private FlinkWriteResult of(long checkpointId, DataFile dataFile) { + return new FlinkWriteResult(checkpointId, WriteResult.builder().addDataFiles(dataFile).build()); + } + + @TestTemplate + public void testCommitTxn() throws Exception { + // Test with 3 continues checkpoints: + // 1. snapshotState for checkpoint#1 + // 2. notifyCheckpointComplete for checkpoint#1 + // 3. snapshotState for checkpoint#2 + // 4. notifyCheckpointComplete for checkpoint#2 + // 5. snapshotState for checkpoint#3 + // 6. 
notifyCheckpointComplete for checkpoint#3 + long timestamp = 0; + + JobID jobID = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobID)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + + List rows = Lists.newArrayListWithExpectedSize(3); + for (int i = 1; i <= 3; i++) { + RowData rowData = SimpleDataUtil.createRowData(i, "hello" + i); + DataFile dataFile = writeDataFile("data-" + i, ImmutableList.of(rowData)); + harness.processElement(of(i, dataFile), ++timestamp); + rows.add(rowData); + + harness.snapshot(i, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(i); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows), branch); + assertSnapshotSize(i); + assertMaxCommittedCheckpointId(jobID, operatorId, i); + assertThat(SimpleDataUtil.latestSnapshot(table, branch).summary()) + .containsEntry("flink.test", TestIcebergFilesCommitter.class.getName()); + } + } + } + + @TestTemplate + public void testOrderedEventsBetweenCheckpoints() throws Exception { + // It's possible that two checkpoints happen in the following orders: + // 1. snapshotState for checkpoint#1; + // 2. snapshotState for checkpoint#2; + // 3. notifyCheckpointComplete for checkpoint#1; + // 4. notifyCheckpointComplete for checkpoint#2; + long timestamp = 0; + + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello"); + DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); + + long firstCheckpointId = 1; + harness.processElement(of(firstCheckpointId, dataFile1), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 1. snapshotState for checkpoint#1 + harness.snapshot(firstCheckpointId, ++timestamp); + assertFlinkManifests(1); + + RowData row2 = SimpleDataUtil.createRowData(2, "world"); + DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); + long secondCheckpointId = 2; + harness.processElement(of(secondCheckpointId, dataFile2), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 2. snapshotState for checkpoint#2 + harness.snapshot(secondCheckpointId, ++timestamp); + assertFlinkManifests(2); + + // 3. notifyCheckpointComplete for checkpoint#1 + harness.notifyOfCompletedCheckpoint(firstCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, firstCheckpointId); + assertFlinkManifests(1); + + // 4. notifyCheckpointComplete for checkpoint#2 + harness.notifyOfCompletedCheckpoint(secondCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); + assertFlinkManifests(0); + } + } + + @TestTemplate + public void testDisorderedEventsBetweenCheckpoints() throws Exception { + // It's possible that the two checkpoints happen in the following orders: + // 1. snapshotState for checkpoint#1; + // 2. snapshotState for checkpoint#2; + // 3. notifyCheckpointComplete for checkpoint#2; + // 4. 
notifyCheckpointComplete for checkpoint#1; + long timestamp = 0; + + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello"); + DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); + + long firstCheckpointId = 1; + harness.processElement(of(firstCheckpointId, dataFile1), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 1. snapshotState for checkpoint#1 + harness.snapshot(firstCheckpointId, ++timestamp); + assertFlinkManifests(1); + + RowData row2 = SimpleDataUtil.createRowData(2, "world"); + DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); + long secondCheckpointId = 2; + harness.processElement(of(secondCheckpointId, dataFile2), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 2. snapshotState for checkpoint#2 + harness.snapshot(secondCheckpointId, ++timestamp); + assertFlinkManifests(2); + + // 3. notifyCheckpointComplete for checkpoint#2 + harness.notifyOfCompletedCheckpoint(secondCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); + assertFlinkManifests(0); + + // 4. notifyCheckpointComplete for checkpoint#1 + harness.notifyOfCompletedCheckpoint(firstCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); + assertFlinkManifests(0); + } + } + + @TestTemplate + public void testRecoveryFromValidSnapshot() throws Exception { + long checkpointId = 0; + long timestamp = 0; + List expectedRows = Lists.newArrayList(); + OperatorSubtaskState snapshot; + + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row = SimpleDataUtil.createRowData(1, "hello"); + expectedRows.add(row); + DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row)); + + harness.processElement(of(++checkpointId, dataFile1), ++timestamp); + snapshot = harness.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row), branch); + assertSnapshotSize(1); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + } + + // Restore from the given snapshot + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.getStreamConfig().setOperatorID(operatorId); + harness.setup(); + harness.initializeState(snapshot); + harness.open(); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(1); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + + RowData row = SimpleDataUtil.createRowData(2, "world"); + expectedRows.add(row); + DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); + harness.processElement(of(++checkpointId, dataFile), ++timestamp); + + harness.snapshot(checkpointId, ++timestamp); + 
assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(2); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + } + } + + @TestTemplate + public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception { + // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's + // possible that we + // flink job will restore from a checkpoint with only step#1 finished. + long checkpointId = 0; + long timestamp = 0; + OperatorSubtaskState snapshot; + List expectedRows = Lists.newArrayList(); + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row = SimpleDataUtil.createRowData(1, "hello"); + expectedRows.add(row); + DataFile dataFile = writeDataFile("data-1", ImmutableList.of(row)); + harness.processElement(of(++checkpointId, dataFile), ++timestamp); + + snapshot = harness.snapshot(checkpointId, ++timestamp); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + assertFlinkManifests(1); + } + + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.getStreamConfig().setOperatorID(operatorId); + harness.setup(); + harness.initializeState(snapshot); + harness.open(); + + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + + harness.snapshot(++checkpointId, ++timestamp); + // Did not write any new record, so it won't generate new manifest. + assertFlinkManifests(0); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(2); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + + RowData row = SimpleDataUtil.createRowData(2, "world"); + expectedRows.add(row); + DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); + harness.processElement(of(++checkpointId, dataFile), ++timestamp); + + snapshot = harness.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(1); + } + + // Redeploying flink job from external checkpoint. + JobID newJobId = new JobID(); + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(newJobId)) { + harness.setup(); + harness.initializeState(snapshot); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. 
+ assertFlinkManifests(0); + + assertMaxCommittedCheckpointId(newJobId, operatorId, -1); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(3); + + RowData row = SimpleDataUtil.createRowData(3, "foo"); + expectedRows.add(row); + DataFile dataFile = writeDataFile("data-3", ImmutableList.of(row)); + harness.processElement(of(++checkpointId, dataFile), ++timestamp); + + harness.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(4); + assertMaxCommittedCheckpointId(newJobId, operatorId, checkpointId); + } + } + + @TestTemplate + public void testStartAnotherJobToWriteSameTable() throws Exception { + long checkpointId = 0; + long timestamp = 0; + List rows = Lists.newArrayList(); + List tableRows = Lists.newArrayList(); + + JobID oldJobId = new JobID(); + OperatorID oldOperatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(oldJobId)) { + harness.setup(); + harness.open(); + oldOperatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, -1L); + + for (int i = 1; i <= 3; i++) { + rows.add(SimpleDataUtil.createRowData(i, "hello" + i)); + tableRows.addAll(rows); + + DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); + harness.processElement(of(++checkpointId, dataFile), ++timestamp); + harness.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, tableRows, branch); + assertSnapshotSize(i); + assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, checkpointId); + } + } + + // The new started job will start with checkpoint = 1 again. 
+ checkpointId = 0; + timestamp = 0; + JobID newJobId = new JobID(); + OperatorID newOperatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(newJobId)) { + harness.setup(); + harness.open(); + newOperatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(3); + assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, 3); + assertMaxCommittedCheckpointId(newJobId, newOperatorId, -1); + + rows.add(SimpleDataUtil.createRowData(2, "world")); + tableRows.addAll(rows); + + DataFile dataFile = writeDataFile("data-new-1", rows); + harness.processElement(of(++checkpointId, dataFile), ++timestamp); + harness.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + SimpleDataUtil.assertTableRows(table, tableRows, branch); + assertSnapshotSize(4); + assertMaxCommittedCheckpointId(newJobId, newOperatorId, checkpointId); + } + } + + @TestTemplate + public void testMultipleJobsWriteSameTable() throws Exception { + long timestamp = 0; + List tableRows = Lists.newArrayList(); + + JobID[] jobs = new JobID[] {new JobID(), new JobID(), new JobID()}; + OperatorID[] operatorIds = + new OperatorID[] {new OperatorID(), new OperatorID(), new OperatorID()}; + for (int i = 0; i < 20; i++) { + int jobIndex = i % 3; + int checkpointId = i / 3; + JobID jobId = jobs[jobIndex]; + OperatorID operatorId = operatorIds[jobIndex]; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.getStreamConfig().setOperatorID(operatorId); + harness.setup(); + harness.open(); + + assertSnapshotSize(i); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId == 0 ? -1 : checkpointId); + + List rows = Lists.newArrayList(SimpleDataUtil.createRowData(i, "word-" + i)); + tableRows.addAll(rows); + + DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); + harness.processElement(of(checkpointId + 1, dataFile), ++timestamp); + harness.snapshot(checkpointId + 1, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId + 1); + assertFlinkManifests(0); + SimpleDataUtil.assertTableRows(table, tableRows, branch); + assertSnapshotSize(i + 1); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId + 1); + } + } + } + + @TestTemplate + public void testMultipleSinksRecoveryFromValidSnapshot() throws Exception { + long checkpointId = 0; + long timestamp = 0; + List expectedRows = Lists.newArrayList(); + OperatorSubtaskState snapshot1; + OperatorSubtaskState snapshot2; + + JobID jobId = new JobID(); + OperatorID operatorId1 = new OperatorID(); + OperatorID operatorId2 = new OperatorID(); + try (OneInputStreamOperatorTestHarness harness1 = + createStreamSink(jobId); + OneInputStreamOperatorTestHarness harness2 = + createStreamSink(jobId)) { + harness1.getStreamConfig().setOperatorID(operatorId1); + harness1.setup(); + harness1.open(); + harness2.getStreamConfig().setOperatorID(operatorId2); + harness2.setup(); + harness2.open(); + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, operatorId1, -1L); + assertMaxCommittedCheckpointId(jobId, operatorId2, -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello1"); + expectedRows.add(row1); + DataFile dataFile1 = writeDataFile("data-1-1", ImmutableList.of(row1)); + + harness1.processElement(of(++checkpointId, dataFile1), ++timestamp); + snapshot1 = harness1.snapshot(checkpointId, ++timestamp); + + RowData row2 = SimpleDataUtil.createRowData(1, "hello2"); + 
expectedRows.add(row2); + DataFile dataFile2 = writeDataFile("data-1-2", ImmutableList.of(row2)); + + harness2.processElement(of(checkpointId, dataFile2), ++timestamp); + snapshot2 = harness2.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(2); + + // Only notify one of the committers + harness1.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(1); + + // Only the first row is committed at this point + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + assertSnapshotSize(1); + assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId2, -1); + } + + // Restore from the given snapshot + try (OneInputStreamOperatorTestHarness harness1 = + createStreamSink(jobId); + OneInputStreamOperatorTestHarness harness2 = + createStreamSink(jobId)) { + harness1.getStreamConfig().setOperatorID(operatorId1); + harness1.setup(); + harness1.initializeState(snapshot1); + harness1.open(); + + harness2.getStreamConfig().setOperatorID(operatorId2); + harness2.setup(); + harness2.initializeState(snapshot2); + harness2.open(); + + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(2); + assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId2, checkpointId); + + RowData row1 = SimpleDataUtil.createRowData(2, "world1"); + expectedRows.add(row1); + DataFile dataFile1 = writeDataFile("data-2-1", ImmutableList.of(row1)); + + harness1.processElement(of(++checkpointId, dataFile1), ++timestamp); + harness1.snapshot(checkpointId, ++timestamp); + + RowData row2 = SimpleDataUtil.createRowData(2, "world2"); + expectedRows.add(row2); + DataFile dataFile2 = writeDataFile("data-2-2", ImmutableList.of(row2)); + harness2.processElement(of(checkpointId, dataFile2), ++timestamp); + harness2.snapshot(checkpointId, ++timestamp); + + assertFlinkManifests(2); + + harness1.notifyOfCompletedCheckpoint(checkpointId); + harness2.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(4); + assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId2, checkpointId); + } + } + + @TestTemplate + public void testBoundedStream() throws Exception { + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertFlinkManifests(0); + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + List tableRows = Lists.newArrayList(SimpleDataUtil.createRowData(1, "word-1")); + + DataFile dataFile = writeDataFile("data-1", tableRows); + harness.processElement(of(IcebergStreamWriter.END_INPUT_CHECKPOINT_ID, dataFile), 1); + ((BoundedOneInput) harness.getOneInputOperator()).endInput(); + + assertFlinkManifests(0); + SimpleDataUtil.assertTableRows(table, tableRows, branch); + assertSnapshotSize(1); + assertMaxCommittedCheckpointId( + jobId, operatorId, IcebergStreamWriter.END_INPUT_CHECKPOINT_ID); + assertThat(SimpleDataUtil.latestSnapshot(table, branch).summary()) + .containsEntry("flink.test", TestIcebergFilesCommitter.class.getName()); + } + } + + @TestTemplate + public void 
testFlinkManifests() throws Exception { + long timestamp = 0; + final long checkpoint = 10; + + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello"); + DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); + + harness.processElement(of(checkpoint, dataFile1), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 1. snapshotState for checkpoint#1 + harness.snapshot(checkpoint, ++timestamp); + List manifestPaths = assertFlinkManifests(1); + Path manifestPath = manifestPaths.get(0); + assertThat(manifestPath.getFileName()) + .asString() + .isEqualTo( + String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1)); + + // 2. Read the data files from manifests and assert. + List dataFiles = + FlinkManifestUtil.readDataFiles( + createTestingManifestFile(manifestPath, dataFile1), table.io(), table.specs()); + assertThat(dataFiles).hasSize(1); + TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); + + // 3. notifyCheckpointComplete for checkpoint#1 + harness.notifyOfCompletedCheckpoint(checkpoint); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); + assertFlinkManifests(0); + } + } + + @TestTemplate + public void testDeleteFiles() throws Exception { + assumeThat(formatVersion) + .as("Only support equality-delete in format v2 or later.") + .isGreaterThan(1); + + long timestamp = 0; + long checkpoint = 10; + + JobID jobId = new JobID(); + OperatorID operatorId; + FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); + + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row1 = SimpleDataUtil.createInsert(1, "aaa"); + DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(row1)); + harness.processElement(of(checkpoint, dataFile1), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 1. snapshotState for checkpoint#1 + harness.snapshot(checkpoint, ++timestamp); + List manifestPaths = assertFlinkManifests(1); + Path manifestPath = manifestPaths.get(0); + assertThat(manifestPath.getFileName()) + .asString() + .isEqualTo( + String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1)); + + // 2. Read the data files from manifests and assert. + List dataFiles = + FlinkManifestUtil.readDataFiles( + createTestingManifestFile(manifestPath, dataFile1), table.io(), table.specs()); + assertThat(dataFiles).hasSize(1); + TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); + + // 3. notifyCheckpointComplete for checkpoint#1 + harness.notifyOfCompletedCheckpoint(checkpoint); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); + assertFlinkManifests(0); + + // 4. process both data files and delete files. 
+ RowData row2 = SimpleDataUtil.createInsert(2, "bbb"); + DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(row2)); + + RowData delete1 = SimpleDataUtil.createDelete(1, "aaa"); + DeleteFile deleteFile1 = + writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); + harness.processElement( + new FlinkWriteResult( + ++checkpoint, + WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile1).build()), + ++timestamp); + + // 5. snapshotState for checkpoint#2 + harness.snapshot(checkpoint, ++timestamp); + assertFlinkManifests(2); + + // 6. notifyCheckpointComplete for checkpoint#2 + harness.notifyOfCompletedCheckpoint(checkpoint); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row2), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); + assertFlinkManifests(0); + } + } + + @TestTemplate + public void testCommitTwoCheckpointsInSingleTxn() throws Exception { + assumeThat(formatVersion) + .as("Only support equality-delete in format v2 or later.") + .isGreaterThan(1); + + long timestamp = 0; + long checkpoint = 10; + + JobID jobId = new JobID(); + OperatorID operatorId; + FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); + + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData insert1 = SimpleDataUtil.createInsert(1, "aaa"); + RowData insert2 = SimpleDataUtil.createInsert(2, "bbb"); + RowData delete3 = SimpleDataUtil.createDelete(3, "ccc"); + DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(insert1, insert2)); + DeleteFile deleteFile1 = + writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3)); + harness.processElement( + new FlinkWriteResult( + checkpoint, + WriteResult.builder().addDataFiles(dataFile1).addDeleteFiles(deleteFile1).build()), + ++timestamp); + + // The 1th snapshotState. + harness.snapshot(checkpoint, ++timestamp); + + RowData insert4 = SimpleDataUtil.createInsert(4, "ddd"); + RowData delete2 = SimpleDataUtil.createDelete(2, "bbb"); + DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(insert4)); + DeleteFile deleteFile2 = + writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2)); + harness.processElement( + new FlinkWriteResult( + ++checkpoint, + WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile2).build()), + ++timestamp); + + // The 2nd snapshotState. + harness.snapshot(checkpoint, ++timestamp); + + // Notify the 2nd snapshot to complete. + harness.notifyOfCompletedCheckpoint(checkpoint); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert4), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); + assertFlinkManifests(0); + assertThat(table.snapshots()).hasSize(2); + } + } + + /** + * The testcase is to simulate upserting to an Iceberg V2 table, and facing the following + * scenario: + * + *
<ul> + *   <li>A specific row is updated + *   <li>The prepareSnapshotPreBarrier triggered + *   <li>Checkpoint failed for reasons outside of the Iceberg connector + *   <li>The specific row is updated again in the second checkpoint as well + *   <li>Second snapshot is triggered, and finished + * </ul> + * + * <p>
    Previously the files from the 2 snapshots were committed in a single Iceberg commit, as a + * results duplicate rows were created in the table. + * + * @throws Exception Exception + */ + @TestTemplate + public void testCommitMultipleCheckpointsForV2Table() throws Exception { + assumeThat(formatVersion) + .as("Only support equality-delete in format v2 or later.") + .isGreaterThan(1); + + long timestamp = 0; + long checkpoint = 10; + + JobID jobId = new JobID(); + OperatorID operatorId; + + FileAppenderFactory appenderFactory = + new FlinkAppenderFactory( + table, + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + new int[] {table.schema().findField("id").fieldId()}, + table.schema(), + null); + + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData insert1 = null; + RowData insert2 = null; + for (int i = 1; i <= 3; i++) { + insert1 = SimpleDataUtil.createInsert(1, "aaa" + i); + insert2 = SimpleDataUtil.createInsert(2, "bbb" + i); + DataFile dataFile = writeDataFile("data-file-" + i, ImmutableList.of(insert1, insert2)); + DeleteFile deleteFile = + writeEqDeleteFile( + appenderFactory, "delete-file-" + i, ImmutableList.of(insert1, insert2)); + harness.processElement( + new FlinkWriteResult( + ++checkpoint, + WriteResult.builder().addDataFiles(dataFile).addDeleteFiles(deleteFile).build()), + ++timestamp); + } + + harness.snapshot(checkpoint, ++timestamp); + harness.notifyOfCompletedCheckpoint(checkpoint); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert2), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); + assertFlinkManifests(0); + assertThat(table.snapshots()).hasSize(3); + } + } + + @TestTemplate + public void testSpecEvolution() throws Exception { + long timestamp = 0; + int checkpointId = 0; + List rows = Lists.newArrayList(); + JobID jobId = new JobID(); + + OperatorID operatorId; + OperatorSubtaskState snapshot; + DataFile dataFile; + int specId; + + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + + checkpointId++; + RowData rowData = SimpleDataUtil.createRowData(checkpointId, "hello" + checkpointId); + // table unpartitioned + dataFile = writeDataFile("data-" + checkpointId, ImmutableList.of(rowData)); + harness.processElement(of(checkpointId, dataFile), ++timestamp); + rows.add(rowData); + harness.snapshot(checkpointId, ++timestamp); + + specId = + getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); + assertThat(specId).isEqualTo(table.spec().specId()); + + harness.notifyOfCompletedCheckpoint(checkpointId); + + // Change partition spec + table.refresh(); + PartitionSpec oldSpec = table.spec(); + table.updateSpec().addField("id").commit(); + + checkpointId++; + rowData = SimpleDataUtil.createRowData(checkpointId, "hello" + checkpointId); + // write data with old partition spec + dataFile = writeDataFile("data-" + checkpointId, ImmutableList.of(rowData), oldSpec, null); + harness.processElement(of(checkpointId, dataFile), ++timestamp); + rows.add(rowData); + snapshot = harness.snapshot(checkpointId, ++timestamp); + + specId = + getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); + 
assertThat(specId).isEqualTo(oldSpec.specId()); + + harness.notifyOfCompletedCheckpoint(checkpointId); + + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows), branch); + assertSnapshotSize(checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + } + + // Restore from the given snapshot + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(jobId)) { + harness.getStreamConfig().setOperatorID(operatorId); + harness.setup(); + harness.initializeState(snapshot); + harness.open(); + + SimpleDataUtil.assertTableRows(table, rows, branch); + assertSnapshotSize(checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + + checkpointId++; + RowData row = SimpleDataUtil.createRowData(checkpointId, "world" + checkpointId); + StructLike partition = new PartitionData(table.spec().partitionType()); + partition.set(0, checkpointId); + dataFile = + writeDataFile("data-" + checkpointId, ImmutableList.of(row), table.spec(), partition); + harness.processElement(of(checkpointId, dataFile), ++timestamp); + rows.add(row); + harness.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(1); + + specId = + getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); + assertThat(specId).isEqualTo(table.spec().specId()); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, rows, branch); + assertSnapshotSize(checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + } + } + + private int getStagingManifestSpecId(OperatorStateStore operatorStateStore, long checkPointId) + throws Exception { + ListState> checkpointsState = + operatorStateStore.getListState(IcebergFilesCommitter.buildStateDescriptor()); + NavigableMap statedDataFiles = + Maps.newTreeMap(checkpointsState.get().iterator().next()); + DeltaManifests deltaManifests = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, statedDataFiles.get(checkPointId)); + return deltaManifests.dataManifest().partitionSpecId(); + } + + private DeleteFile writeEqDeleteFile( + FileAppenderFactory appenderFactory, String filename, List deletes) + throws IOException { + return SimpleDataUtil.writeEqDeleteFile(table, format, filename, appenderFactory, deletes); + } + + private DeleteFile writePosDeleteFile( + FileAppenderFactory appenderFactory, + String filename, + List> positions) + throws IOException { + return SimpleDataUtil.writePosDeleteFile(table, format, filename, appenderFactory, positions); + } + + private FileAppenderFactory createDeletableAppenderFactory() { + int[] equalityFieldIds = + new int[] { + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() + }; + return new FlinkAppenderFactory( + table, + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + equalityFieldIds, + table.schema(), + null); + } + + private ManifestFile createTestingManifestFile(Path manifestPath, DataFile dataFile) + throws IOException { + ManifestWriter writer = + ManifestFiles.write( + formatVersion, + PartitionSpec.unpartitioned(), + table.io().newOutputFile(manifestPath.toString()), + 0L); + writer.add(dataFile); + writer.close(); + return writer.toManifestFile(); + } + + private List assertFlinkManifests(int expectedCount) throws IOException { + List manifests = + Files.list(flinkManifestFolder.toPath()) + .filter(p -> 
!p.toString().endsWith(".crc")) + .collect(Collectors.toList()); + assertThat(manifests).hasSize(expectedCount); + return manifests; + } + + private DataFile writeDataFile(String filename, List rows) throws IOException { + return SimpleDataUtil.writeFile( + table, + table.schema(), + table.spec(), + CONF, + table.location(), + format.addExtension(filename), + rows); + } + + private DataFile writeDataFile( + String filename, List rows, PartitionSpec spec, StructLike partition) + throws IOException { + return SimpleDataUtil.writeFile( + table, + table.schema(), + spec, + CONF, + table.location(), + format.addExtension(filename), + rows, + partition); + } + + private void assertMaxCommittedCheckpointId(JobID jobID, OperatorID operatorID, long expectedId) { + table.refresh(); + long actualId = + SinkUtil.getMaxCommittedCheckpointId( + table, jobID.toString(), operatorID.toString(), branch); + assertThat(actualId).isEqualTo(expectedId); + } + + private void assertSnapshotSize(int expectedSnapshotSize) { + table.refresh(); + assertThat(table.snapshots()).hasSize(expectedSnapshotSize); + } + + private OneInputStreamOperatorTestHarness createStreamSink(JobID jobID) + throws Exception { + TestOperatorFactory factory = TestOperatorFactory.of(table.location(), branch, table.spec()); + return new OneInputStreamOperatorTestHarness<>(factory, createEnvironment(jobID)); + } + + private static MockEnvironment createEnvironment(JobID jobID) { + return new MockEnvironmentBuilder() + .setTaskName("test task") + .setManagedMemorySize(32 * 1024) + .setInputSplitProvider(new MockInputSplitProvider()) + .setBufferSize(256) + .setTaskConfiguration(new org.apache.flink.configuration.Configuration()) + .setExecutionConfig(new ExecutionConfig()) + .setMaxParallelism(16) + .setJobID(jobID) + .build(); + } + + private static class TestOperatorFactory extends AbstractStreamOperatorFactory + implements OneInputStreamOperatorFactory { + private final String tablePath; + private final String branch; + private final PartitionSpec spec; + + private TestOperatorFactory(String tablePath, String branch, PartitionSpec spec) { + this.tablePath = tablePath; + this.branch = branch; + this.spec = spec; + } + + private static TestOperatorFactory of(String tablePath, String branch, PartitionSpec spec) { + return new TestOperatorFactory(tablePath, branch, spec); + } + + @Override + @SuppressWarnings("unchecked") + public > T createStreamOperator( + StreamOperatorParameters params) { + IcebergFilesCommitter committer = + new IcebergFilesCommitter( + params, + new TestTableLoader(tablePath), + false, + Collections.singletonMap("flink.test", TestIcebergFilesCommitter.class.getName()), + ThreadPools.WORKER_THREAD_POOL_SIZE, + branch, + spec); + return (T) committer; + } + + @Override + public Class getStreamOperatorClass(ClassLoader classLoader) { + return IcebergFilesCommitter.class; + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java new file mode 100644 index 000000000000..4f2b09ee55ff --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java @@ -0,0 +1,563 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.dag.Transformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.sink.IcebergSink.Builder; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergSink extends TestFlinkIcebergSinkBase { + + private TableLoader tableLoader; + + @Parameter(index = 0) + private FileFormat format; + + @Parameter(index = 1) + private int parallelism; + + @Parameter(index = 2) + private boolean partitioned; + + @Parameter(index = 3) + private boolean isTableSchema; + + @Parameters(name = "format={0}, parallelism={1}, partitioned={2}, isTableSchema={3}") + public static Object[][] parameters() { + return new Object[][] { + // Remove after the deprecation of TableSchema - BEGIN + {FileFormat.AVRO, 1, true, true}, + {FileFormat.AVRO, 1, false, true}, + {FileFormat.AVRO, 2, true, true}, + {FileFormat.AVRO, 2, false, true}, + {FileFormat.ORC, 1, true, true}, + {FileFormat.ORC, 1, false, true}, + {FileFormat.ORC, 2, true, true}, + {FileFormat.ORC, 2, false, true}, + 
{FileFormat.PARQUET, 1, true, true}, + {FileFormat.PARQUET, 1, false, true}, + {FileFormat.PARQUET, 2, true, true}, + {FileFormat.PARQUET, 2, false, true}, + // Remove after the deprecation of TableSchema - END + + {FileFormat.AVRO, 1, true, false}, + {FileFormat.AVRO, 1, false, false}, + {FileFormat.AVRO, 2, true, false}, + {FileFormat.AVRO, 2, false, false}, + {FileFormat.ORC, 1, true, false}, + {FileFormat.ORC, 1, false, false}, + {FileFormat.ORC, 2, true, false}, + {FileFormat.ORC, 2, false, false}, + {FileFormat.PARQUET, 1, true, false}, + {FileFormat.PARQUET, 1, false, false}, + {FileFormat.PARQUET, 2, true, false}, + {FileFormat.PARQUET, 2, false, false}, + }; + } + + @BeforeEach + void before() throws IOException { + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + void testWriteRowData() throws Exception { + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + + IcebergSink.forRowData(dataStream) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .append(); + + // Execute the program. + env.execute("Test Iceberg DataStream"); + + // Assert the iceberg table's records. + SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + } + + @TestTemplate + void testWriteRow() throws Exception { + testWriteRow(null, DistributionMode.NONE); + } + + @TestTemplate + void testWriteRowWithTableSchema() throws Exception { + testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); + } + + @TestTemplate + void testPartitionWriteMode() throws Exception { + testWriteRow(null, DistributionMode.HASH); + if (partitioned) { + assertThat(partitionFiles("aaa")) + .as("There should be only 1 data file in partition 'aaa'") + .isEqualTo(1); + assertThat(partitionFiles("bbb")) + .as("There should be only 1 data file in partition 'bbb'") + .isEqualTo(1); + assertThat(partitionFiles("ccc")) + .as("There should be only 1 data file in partition 'ccc'") + .isEqualTo(1); + } + } + + @TestTemplate + void testShuffleByPartitionWithSchema() throws Exception { + testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH); + if (partitioned) { + assertThat(partitionFiles("aaa")) + .as("There should be only 1 data file in partition 'aaa'") + .isEqualTo(1); + assertThat(partitionFiles("bbb")) + .as("There should be only 1 data file in partition 'bbb'") + .isEqualTo(1); + assertThat(partitionFiles("ccc")) + .as("There should be only 1 data file in partition 'ccc'") + .isEqualTo(1); + } + } + + @TestTemplate + void testTwoSinksInDisjointedDAG() throws Exception { + Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + + Table leftTable = + CATALOG_EXTENSION + .catalog() + .createTable( + TableIdentifier.of("left"), + SimpleDataUtil.SCHEMA, + partitioned + ? 
PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + props); + TableLoader leftTableLoader = + TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("left")); + + Table rightTable = + CATALOG_EXTENSION + .catalog() + .createTable( + TableIdentifier.of("right"), + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + props); + TableLoader rightTableLoader = + TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("right")); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + env.getConfig().disableAutoGeneratedUIDs(); + + List leftRows = createRows("left-"); + DataStream leftStream = + env.addSource(createBoundedSource(leftRows), ROW_TYPE_INFO) + .name("leftCustomSource") + .uid("leftCustomSource"); + + if (isTableSchema) { + IcebergSink.forRow(leftStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(leftTable) + .tableLoader(leftTableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .distributionMode(DistributionMode.NONE) + .uidSuffix("leftIcebergSink") + .append(); + } else { + IcebergSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) + .table(leftTable) + .tableLoader(leftTableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .distributionMode(DistributionMode.NONE) + .uidSuffix("leftIcebergSink") + .append(); + } + + List rightRows = createRows("right-"); + DataStream rightStream = + env.addSource(createBoundedSource(rightRows), ROW_TYPE_INFO) + .name("rightCustomSource") + .uid("rightCustomSource"); + + if (isTableSchema) { + IcebergSink.forRow(rightStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(rightTable) + .tableLoader(rightTableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.HASH) + .uidSuffix("rightIcebergSink") + .setSnapshotProperty("flink.test", TestIcebergSink.class.getName()) + .snapshotProperties(Collections.singletonMap("direction", "rightTable")) + .append(); + } else { + IcebergSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) + .table(rightTable) + .tableLoader(rightTableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.HASH) + .uidSuffix("rightIcebergSink") + .setSnapshotProperty("flink.test", TestIcebergSink.class.getName()) + .snapshotProperties(Collections.singletonMap("direction", "rightTable")) + .append(); + } + + // Execute the program. 
+ env.execute("Test Iceberg DataStream."); + + SimpleDataUtil.assertTableRows(leftTable, convertToRowData(leftRows)); + SimpleDataUtil.assertTableRows(rightTable, convertToRowData(rightRows)); + + leftTable.refresh(); + + assertThat(leftTable.currentSnapshot().summary().get("flink.test")).isNull(); + assertThat(leftTable.currentSnapshot().summary().get("direction")).isNull(); + + assertThat(rightTable.currentSnapshot().summary().get("flink.test")) + .isEqualTo(TestIcebergSink.class.getName()); + assertThat(rightTable.currentSnapshot().summary().get("direction")).isEqualTo("rightTable"); + } + + @TestTemplate + void testOverrideWriteConfigWithUnknownFileFormat() { + Map newProps = Maps.newHashMap(); + newProps.put(FlinkWriteOptions.WRITE_FORMAT.key(), "UNRECOGNIZED"); + + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + Builder builder = + isTableSchema + ? IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps) + .uidSuffix("ingestion") + : IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps) + .uidSuffix("ingestion"); + assertThatThrownBy(builder::append) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid file format: UNRECOGNIZED"); + } + + @TestTemplate + void testWriteRowWithTableRefreshInterval() throws Exception { + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + + Configuration flinkConf = new Configuration(); + flinkConf.setString(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key(), "100ms"); + + IcebergSink.forRowData(dataStream) + .table(table) + .tableLoader(tableLoader) + .flinkConf(flinkConf) + .writeParallelism(parallelism) + .append(); + + // Execute the program. + env.execute("Test Iceberg DataStream"); + + // Assert the iceberg table's records. 
+ SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + } + + @TestTemplate + void testOperatorsUidNameNoUidSuffix() { + List rows = createRows(""); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.HASH) + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.HASH) + .append(); + } + + Transformation firstTransformation = env.getTransformations().get(0); + Transformation secondTransformation = env.getTransformations().get(1); + assertThat(firstTransformation.getUid()).isEqualTo("Sink pre-writer mapper: hadoop.default.t"); + assertThat(firstTransformation.getName()).isEqualTo("Sink pre-writer mapper: hadoop.default.t"); + assertThat(secondTransformation.getUid()).isEqualTo("hadoop.default.t"); + assertThat(secondTransformation.getName()).isEqualTo("hadoop.default.t"); + } + + @TestTemplate + void testOperatorsUidNameWitUidSuffix() { + List rows = createRows(""); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.HASH) + .uidSuffix("data-ingestion") + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.HASH) + .uidSuffix("data-ingestion") + .append(); + } + + Transformation firstTransformation = env.getTransformations().get(0); + Transformation secondTransformation = env.getTransformations().get(1); + assertThat(firstTransformation.getUid()).isEqualTo("Sink pre-writer mapper: data-ingestion"); + assertThat(firstTransformation.getName()).isEqualTo("Sink pre-writer mapper: data-ingestion"); + assertThat(secondTransformation.getUid()).isEqualTo("data-ingestion"); + assertThat(secondTransformation.getName()).isEqualTo("data-ingestion"); + } + + @TestTemplate + void testErrorOnNullForRequiredField() { + assumeThat(format) + .as("ORC file format supports null values even for required fields.") + .isNotEqualTo(FileFormat.ORC); + + Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "id2", Types.IntegerType.get()), + Types.NestedField.required(2, "data2", Types.StringType.get())); + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, "t2"); + Table table2 = + CATALOG_EXTENSION + .catalog() + .createTable( + tableIdentifier, + icebergSchema, + PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + + // Null out a required field + List rows = List.of(Row.of(42, null)); + + env = StreamExecutionEnvironment.getExecutionEnvironment(); + + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); + + if (isTableSchema) { + TableSchema flinkSchema = FlinkSchemaUtil.toSchema(icebergSchema); + 
IcebergSink.forRow(dataStream, flinkSchema) + .table(table2) + .tableLoader(TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), tableIdentifier)) + .tableSchema(flinkSchema) + .writeParallelism(parallelism) + .append(); + } else { + ResolvedSchema flinkSchema = FlinkSchemaUtil.toResolvedSchema(icebergSchema); + IcebergSink.forRow(dataStream, flinkSchema) + .table(table2) + .tableLoader(TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), tableIdentifier)) + .resolvedSchema(flinkSchema) + .writeParallelism(parallelism) + .append(); + } + + assertThatThrownBy(() -> env.execute()).hasRootCauseInstanceOf(NullPointerException.class); + } + + @TestTemplate + void testDefaultWriteParallelism() { + List rows = createRows(""); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); + + var sink = + isTableSchema + ? IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .distributionMode(DistributionMode.NONE) + .append() + : IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .distributionMode(DistributionMode.NONE) + .append(); + + // since the sink write parallelism was null, it asserts that the default parallelism used was + // the input source parallelism. + // sink.getTransformation is referring to the SinkV2 Writer Operator associated to the + // IcebergSink + assertThat(sink.getTransformation().getParallelism()).isEqualTo(dataStream.getParallelism()); + } + + @TestTemplate + void testWriteParallelism() { + List rows = createRows(""); + + // the parallelism of this input source is always 1, as this is a non-parallel source. + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); + + var sink = + isTableSchema + ? IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .distributionMode(DistributionMode.NONE) + .writeParallelism(parallelism) + .append() + : IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .distributionMode(DistributionMode.NONE) + .writeParallelism(parallelism) + .append(); + + // The parallelism has been properly specified when creating the IcebergSink, so this asserts + // that its value is the same as the parallelism TestTemplate parameter + // sink.getTransformation is referring to the SinkV2 Writer Operator associated to the + // IcebergSink + assertThat(sink.getTransformation().getParallelism()).isEqualTo(parallelism); + } + + private void testWriteRow(ResolvedSchema resolvedSchema, DistributionMode distributionMode) + throws Exception { + List rows = createRows(""); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .tableSchema( + resolvedSchema == null ? 
null : TableSchema.fromResolvedSchema(resolvedSchema)) + .writeParallelism(parallelism) + .distributionMode(distributionMode) + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .resolvedSchema(resolvedSchema) + .writeParallelism(parallelism) + .distributionMode(distributionMode) + .append(); + } + + // Execute the program. + env.execute("Test Iceberg DataStream."); + + SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java new file mode 100644 index 000000000000..ddcb57f6ca33 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.util.List; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergSinkBranch extends TestFlinkIcebergSinkBase { + + @Parameter(index = 0) + private String branch; + + @Parameter(index = 1) + private boolean isTableSchema; + + @Parameters(name = "branch = {0}, isTableSchema = {1}") + public static Object[][] parameters() { + return new Object[][] { + // Remove after the deprecation of TableSchema - BEGIN + {"main", true}, + {"testBranch", true}, + // Remove after the deprecation of TableSchema - END + + {"main", false}, + {"testBranch", false}, + }; + } + + @BeforeEach + public void before() 
throws IOException { + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of( + TableProperties.DEFAULT_FILE_FORMAT, + FileFormat.AVRO.name(), + TableProperties.FORMAT_VERSION, + "1")); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100); + + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testWriteRowWithTableSchema() throws Exception { + testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); + verifyOtherBranchUnmodified(); + } + + private void testWriteRow(ResolvedSchema resolvedSchema, DistributionMode distributionMode) + throws Exception { + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .tableSchema(TableSchema.fromResolvedSchema(resolvedSchema)) + .toBranch(branch) + .distributionMode(distributionMode) + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .resolvedSchema(resolvedSchema) + .toBranch(branch) + .distributionMode(distributionMode) + .append(); + } + + // Execute the program. + env.execute("Test Iceberg DataStream."); + + SimpleDataUtil.assertTableRows(table, convertToRowData(rows), branch); + SimpleDataUtil.assertTableRows( + table, + ImmutableList.of(), + branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH); + + verifyOtherBranchUnmodified(); + } + + private void verifyOtherBranchUnmodified() { + String otherBranch = + branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH; + if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { + assertThat(table.currentSnapshot()).isNull(); + } + + assertThat(table.snapshot(otherBranch)).isNull(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java new file mode 100644 index 000000000000..b84d21d020b3 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import org.apache.flink.runtime.jobgraph.JobVertex; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.graph.StreamGraph; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestReader; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.maintenance.api.LockConfig; +import org.apache.iceberg.flink.maintenance.api.RewriteDataFilesConfig; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +class TestIcebergSinkCompact extends TestFlinkIcebergSinkBase { + + private Map flinkConf; + + @BeforeEach + void before() throws IOException { + this.flinkConf = Maps.newHashMap(); + flinkConf.put(FlinkWriteOptions.COMPACTION_ENABLE.key(), "true"); + flinkConf.put(LockConfig.LOCK_TYPE_OPTION.key(), LockConfig.JdbcLockConfig.JDBC); + flinkConf.put( + LockConfig.JdbcLockConfig.JDBC_URI_OPTION.key(), + "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", "")); + flinkConf.put(LockConfig.LOCK_ID_OPTION.key(), "test-lock-id"); + flinkConf.put(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_SIZE, "1"); + + flinkConf.put(LockConfig.JdbcLockConfig.JDBC_INIT_LOCK_TABLE_OPTION.key(), "true"); + flinkConf.put(RewriteDataFilesConfig.PREFIX + SizeBasedFileRewritePlanner.REWRITE_ALL, "true"); + + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + PartitionSpec.unpartitioned(), + Maps.newHashMap()); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100); + + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @Test + public void testCompactFileE2e() throws Exception { + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + + IcebergSink.forRowData(dataStream) + .setAll(flinkConf) + .table(table) + .tableLoader(tableLoader) + .append(); + + env.execute("Test Iceberg Compaction DataStream"); + + table.refresh(); + // check the data file count after compact + List afterCompactDataFiles = getDataFiles(table.currentSnapshot(), table); + assertThat(afterCompactDataFiles).hasSize(1); + + // check the data file count before compact + List preCompactDataFiles = + getDataFiles(table.snapshot(table.currentSnapshot().parentId()), table); + 
assertThat(preCompactDataFiles).hasSize(3); + } + + private List getDataFiles(Snapshot snapshot, Table table) throws IOException { + List dataFiles = Lists.newArrayList(); + for (ManifestFile dataManifest : snapshot.dataManifests(table.io())) { + try (ManifestReader reader = ManifestFiles.read(dataManifest, table.io())) { + reader.iterator().forEachRemaining(dataFiles::add); + } + } + + return dataFiles; + } + + @Test + public void testTableMaintenanceOperatorAdded() { + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + + IcebergSink.forRowData(dataStream) + .table(table) + .tableLoader(tableLoader) + .setAll(flinkConf) + .append(); + + boolean containRewrite = false; + StreamGraph streamGraph = env.getStreamGraph(); + for (JobVertex vertex : streamGraph.getJobGraph().getVertices()) { + if (vertex.getName().contains("Rewrite")) { + containRewrite = true; + break; + } + } + + assertThat(containRewrite).isTrue(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java new file mode 100644 index 000000000000..f873dcd99c06 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java @@ -0,0 +1,287 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.util.List; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +@ExtendWith(ParameterizedTestExtension.class) +@Timeout(value = 60) +public class TestIcebergSinkV2 extends TestFlinkIcebergSinkV2Base { + @RegisterExtension + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + @BeforeEach + public void setupTable() { + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of( + TableProperties.DEFAULT_FILE_FORMAT, + format.name(), + TableProperties.FORMAT_VERSION, + String.valueOf(FORMAT_V2))); + + table + .updateProperties() + .set(TableProperties.DEFAULT_FILE_FORMAT, format.name()) + .set(TableProperties.WRITE_DISTRIBUTION_MODE, writeDistributionMode) + .commit(); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100L) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testCheckAndGetEqualityFieldIds() { + table + .updateSchema() + .allowIncompatibleChanges() + .addRequiredColumn("type", Types.StringType.get()) + .setIdentifierFields("type") + .commit(); + + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + IcebergSink.Builder builder = + isTableSchema + ? 
IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA).table(table) + : IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); + + // Use user-provided equality field column as equality field id list + builder.equalityFieldColumns(Lists.newArrayList("id")); + assertThat(SinkUtil.checkAndGetEqualityFieldIds(table, Lists.newArrayList("id"))) + .containsExactlyInAnyOrder(table.schema().findField("id").fieldId()); + } + + @TestTemplate + public void testChangeLogOnIdKey() throws Exception { + testChangeLogOnIdKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertOnlyDeletesOnDataKey() throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "aaa"), row("-D", 2, "bbb"))); + + List> expectedRecords = + ImmutableList.of(ImmutableList.of(record(1, "aaa")), ImmutableList.of()); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + true, + elementsPerCheckpoint, + expectedRecords, + SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testChangeLogOnDataKey() throws Exception { + testChangeLogOnDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testChangeLogOnIdDataKey() throws Exception { + testChangeLogOnIdDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testChangeLogOnSameKey() throws Exception { + testChangeLogOnSameKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertModeCheck() { + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + IcebergSink.Builder builder = + isTableSchema + ? IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .writeParallelism(parallelism) + .upsert(true) + : IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .tableLoader(tableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .upsert(true); + + assertThatThrownBy( + () -> + builder + .equalityFieldColumns(ImmutableList.of("id", "data")) + .overwrite(true) + .append()) + .isInstanceOf(IllegalStateException.class) + .hasMessage( + "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); + + assertThatThrownBy( + () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()) + .isInstanceOf(IllegalStateException.class) + .hasMessage( + "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); + } + + @TestTemplate + public void testUpsertOnIdKey() throws Exception { + testUpsertOnIdKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertOnDataKey() throws Exception { + testUpsertOnDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertOnIdDataKey() throws Exception { + testUpsertOnIdDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testDeleteStats() throws Exception { + assumeThat(format).isNotEqualTo(FileFormat.AVRO); + + List> elementsPerCheckpoint = + ImmutableList.of( + // Checkpoint #1 + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa"))); + + List> expectedRecords = ImmutableList.of(ImmutableList.of(record(1, "aaa"))); + + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords, + "main"); + + 
DeleteFile deleteFile = table.currentSnapshot().addedDeleteFiles(table.io()).iterator().next(); + String fromStat = + new String( + deleteFile.lowerBounds().get(MetadataColumns.DELETE_FILE_PATH.fieldId()).array()); + DataFile dataFile = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); + assumeThat(fromStat).isEqualTo(dataFile.location()); + } + + protected void testChangeLogs( + List equalityFieldColumns, + KeySelector keySelector, + boolean insertAsUpsert, + List> elementsPerCheckpoint, + List> expectedRecordsPerCheckpoint, + String branch) + throws Exception { + DataStream dataStream = + env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); + + if (isTableSchema) { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) + .writeParallelism(parallelism) + .equalityFieldColumns(equalityFieldColumns) + .upsert(insertAsUpsert) + .toBranch(branch) + .uidSuffix("sink") + .append(); + } else { + IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .tableLoader(tableLoader) + .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .equalityFieldColumns(equalityFieldColumns) + .upsert(insertAsUpsert) + .toBranch(branch) + .uidSuffix("sink") + .append(); + } + + // Execute the program. + env.execute("Test Iceberg Change-Log DataStream."); + + table.refresh(); + List snapshots = findValidSnapshots(); + int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); + assertThat(snapshots).hasSize(expectedSnapshotNum); + + for (int i = 0; i < expectedSnapshotNum; i++) { + long snapshotId = snapshots.get(i).snapshotId(); + List expectedRecords = expectedRecordsPerCheckpoint.get(i); + assertThat(actualRowSet(snapshotId, "*")) + .as("Should have the expected records for the checkpoint#" + i) + .isEqualTo(expectedRowSet(expectedRecords.toArray(new Record[0]))); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java new file mode 100644 index 000000000000..4896f7f48c17 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergSinkV2Branch extends TestFlinkIcebergSinkV2Branch { + + @BeforeEach + @Override + public void before() throws IOException { + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of( + TableProperties.DEFAULT_FILE_FORMAT, + FileFormat.AVRO.name(), + TableProperties.FORMAT_VERSION, + "2")); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100); + + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testChangeLogOnIdKey() throws Exception { + testChangeLogOnIdKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testChangeLogOnDataKey() throws Exception { + testChangeLogOnDataKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testChangeLogOnIdDataKey() throws Exception { + testChangeLogOnIdDataKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testUpsertOnIdKey() throws Exception { + testUpsertOnIdKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testUpsertOnDataKey() throws Exception { + testUpsertOnDataKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testUpsertOnIdDataKey() throws Exception { + testUpsertOnIdDataKey(branch); + verifyOtherBranchUnmodified(); + } + + private void verifyOtherBranchUnmodified() { + String otherBranch = + branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH; + if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { + assertThat(table.currentSnapshot()); + } + + assertThat(table.snapshot(otherBranch)).isNull(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java new file mode 100644 index 000000000000..7f4f7758e519 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java @@ -0,0 +1,409 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergStreamWriter { + @TempDir protected java.nio.file.Path temporaryFolder; + + private Table table; + + @Parameter(index = 0) + private FileFormat format; + + @Parameter(index = 1) + private boolean partitioned; + + @Parameters(name = "format = {0}, partitioned = {1}") + public static Object[][] parameters() { + return new Object[][] { + {FileFormat.AVRO, true}, + {FileFormat.AVRO, false}, + {FileFormat.ORC, true}, + {FileFormat.ORC, false}, + {FileFormat.PARQUET, true}, + {FileFormat.PARQUET, false} + }; + } + + @BeforeEach + public void before() throws IOException { + File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); + // Construct the iceberg table. 
+ Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + table = SimpleDataUtil.createTable(folder.getAbsolutePath(), props, partitioned); + } + + @TestTemplate + public void testWritingTable() throws Exception { + long checkpointId = 1L; + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + // The first checkpoint + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); + testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 1); + testHarness.processElement(SimpleDataUtil.createRowData(3, "hello"), 1); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + int expectedDataFiles = partitioned ? 2 : 1; + WriteResult result = + WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + + checkpointId = checkpointId + 1; + + // The second checkpoint + testHarness.processElement(SimpleDataUtil.createRowData(4, "foo"), 1); + testHarness.processElement(SimpleDataUtil.createRowData(5, "bar"), 2); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + expectedDataFiles = partitioned ? 4 : 2; + result = + WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + + // Commit the iceberg transaction. + AppendFiles appendFiles = table.newAppend(); + Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); + appendFiles.commit(); + + // Assert the table records. + SimpleDataUtil.assertTableRecords( + table, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), + SimpleDataUtil.createRecord(2, "world"), + SimpleDataUtil.createRecord(3, "hello"), + SimpleDataUtil.createRecord(4, "foo"), + SimpleDataUtil.createRecord(5, "bar"))); + } + } + + @TestTemplate + public void testSnapshotTwice() throws Exception { + long checkpointId = 1; + long timestamp = 1; + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), timestamp++); + testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), timestamp); + + testHarness.prepareSnapshotPreBarrier(checkpointId++); + int expectedDataFiles = partitioned ? 2 : 1; + WriteResult result = + WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + + // snapshot again immediately. + for (int i = 0; i < 5; i++) { + testHarness.prepareSnapshotPreBarrier(checkpointId++); + + result = + WriteResult.builder() + .addAll(getWriteResults(testHarness.extractOutputValues())) + .build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + } + } + } + + @TestTemplate + public void testTableWithoutSnapshot() throws Exception { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + assertThat(testHarness.extractOutputValues()).isEmpty(); + } + // Even if we closed the iceberg stream writer, there's no orphan data file. 
+ assertThat(scanDataFiles()).isEmpty(); + + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); + // Still not emit the data file yet, because there is no checkpoint. + assertThat(testHarness.extractOutputValues()).isEmpty(); + } + // Once we closed the iceberg stream writer, there will left an orphan data file. + assertThat(scanDataFiles()).hasSize(1); + } + + private Set scanDataFiles() throws IOException { + Path dataDir = new Path(table.location(), "data"); + FileSystem fs = FileSystem.get(new Configuration()); + if (!fs.exists(dataDir)) { + return ImmutableSet.of(); + } else { + Set paths = Sets.newHashSet(); + RemoteIterator iterators = fs.listFiles(dataDir, true); + while (iterators.hasNext()) { + LocatedFileStatus status = iterators.next(); + if (status.isFile()) { + Path path = status.getPath(); + if (path.getName().endsWith("." + format.toString().toLowerCase(Locale.ROOT))) { + paths.add(path.toString()); + } + } + } + return paths; + } + } + + @TestTemplate + public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); + testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); + + assertThat(testHarness.getOneInputOperator()).isInstanceOf(BoundedOneInput.class); + ((BoundedOneInput) testHarness.getOneInputOperator()).endInput(); + + int expectedDataFiles = partitioned ? 2 : 1; + WriteResult result = + WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + + ((BoundedOneInput) testHarness.getOneInputOperator()).endInput(); + + result = + WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); + assertThat(result.deleteFiles()).isEmpty(); + // Datafiles should not be sent again + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + } + } + + @TestTemplate + public void testBoundedStreamTriggeredEndInputBeforeTriggeringCheckpoint() throws Exception { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); + testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); + + testHarness.endInput(); + + int expectedDataFiles = partitioned ? 2 : 1; + WriteResult result = + WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + + testHarness.prepareSnapshotPreBarrier(1L); + + result = + WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); + assertThat(result.deleteFiles()).isEmpty(); + // It should be ensured that after endInput is triggered, when prepareSnapshotPreBarrier + // is triggered, write should only send WriteResult once + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + } + } + + @TestTemplate + public void testTableWithTargetFileSize() throws Exception { + // Adjust the target-file-size in table properties. 
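+    // A 4-byte target size is far below any real file size, so the rolling writer closes the
+    // current file whenever its periodic size check fires; the rows written below therefore
+    // spread across several small data files instead of a single one.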
+ table + .updateProperties() + .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger + .commit(); + + List rows = Lists.newArrayListWithCapacity(8000); + List records = Lists.newArrayListWithCapacity(8000); + for (int i = 0; i < 2000; i++) { + for (String data : new String[] {"a", "b", "c", "d"}) { + rows.add(SimpleDataUtil.createRowData(i, data)); + records.add(SimpleDataUtil.createRecord(i, data)); + } + } + + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + for (RowData row : rows) { + testHarness.processElement(row, 1); + } + + // snapshot the operator. + testHarness.prepareSnapshotPreBarrier(1); + WriteResult result = + WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(8); + + // Assert that the data file have the expected records. + for (DataFile dataFile : result.dataFiles()) { + assertThat(dataFile.recordCount()).isEqualTo(1000); + } + + // Commit the iceberg transaction. + AppendFiles appendFiles = table.newAppend(); + Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); + appendFiles.commit(); + } + + // Assert the table records. + SimpleDataUtil.assertTableRecords(table, records); + } + + @TestTemplate + public void testPromotedFlinkDataType() throws Exception { + Schema iSchema = + new Schema( + Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), + Types.NestedField.required(2, "smallint", Types.IntegerType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get())); + ResolvedSchema flinkSchema = + ResolvedSchema.of( + Column.physical("tinyint", DataTypes.TINYINT().notNull()), + Column.physical("smallint", DataTypes.SMALLINT().notNull()), + Column.physical("int", DataTypes.INT().nullable())); + + PartitionSpec spec; + if (partitioned) { + spec = + PartitionSpec.builderFor(iSchema) + .identity("smallint") + .identity("tinyint") + .identity("int") + .build(); + } else { + spec = PartitionSpec.unpartitioned(); + } + + String location = + Files.createTempDirectory(temporaryFolder, "junit").toFile().getAbsolutePath(); + Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + Table icebergTable = new HadoopTables().create(iSchema, spec, props, location); + + List rows = + Lists.newArrayList( + GenericRowData.of((byte) 0x01, (short) -32768, 101), + GenericRowData.of((byte) 0x02, (short) 0, 102), + GenericRowData.of((byte) 0x03, (short) 32767, 103)); + + Record record = GenericRecord.create(iSchema); + List expected = + Lists.newArrayList( + record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), + record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), + record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103))); + + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter(icebergTable, flinkSchema)) { + for (RowData row : rows) { + testHarness.processElement(row, 1); + } + testHarness.prepareSnapshotPreBarrier(1); + WriteResult result = + WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(partitioned ? 3 : 1); + + // Commit the iceberg transaction. 
+ AppendFiles appendFiles = icebergTable.newAppend(); + Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); + appendFiles.commit(); + } + + SimpleDataUtil.assertTableRecords(location, expected); + } + + private static List getWriteResults(List flinkWriteResults) { + return flinkWriteResults.stream() + .map(FlinkWriteResult::writeResult) + .collect(Collectors.toList()); + } + + private OneInputStreamOperatorTestHarness createIcebergStreamWriter() + throws Exception { + return createIcebergStreamWriter(table, SimpleDataUtil.FLINK_SCHEMA); + } + + private OneInputStreamOperatorTestHarness createIcebergStreamWriter( + Table icebergTable, ResolvedSchema flinkSchema) throws Exception { + RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); + FlinkWriteConf flinkWriteConfig = + new FlinkWriteConf( + icebergTable, Maps.newHashMap(), new org.apache.flink.configuration.Configuration()); + + IcebergStreamWriter streamWriter = + FlinkSink.createStreamWriter(() -> icebergTable, flinkWriteConfig, flinkRowType, null); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); + + harness.setup(); + harness.open(); + + return harness; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java new file mode 100644 index 000000000000..919fef579ab0 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.InternalRecordWrapper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.data.RandomRowData; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestRowDataPartitionKey { + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(0, "boolType", Types.BooleanType.get()), + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "longType", Types.LongType.get()), + Types.NestedField.required(3, "dateType", Types.DateType.get()), + Types.NestedField.required(4, "timeType", Types.TimeType.get()), + Types.NestedField.required(5, "stringType", Types.StringType.get()), + Types.NestedField.required(6, "timestampWithoutZone", Types.TimestampType.withoutZone()), + Types.NestedField.required(7, "timestampWithZone", Types.TimestampType.withZone()), + Types.NestedField.required(8, "fixedType", Types.FixedType.ofLength(5)), + Types.NestedField.required(9, "uuidType", Types.UUIDType.get()), + Types.NestedField.required(10, "binaryType", Types.BinaryType.get()), + Types.NestedField.required(11, "decimalType1", Types.DecimalType.of(18, 3)), + Types.NestedField.required(12, "decimalType2", Types.DecimalType.of(10, 5)), + Types.NestedField.required(13, "decimalType3", Types.DecimalType.of(38, 19)), + Types.NestedField.required(14, "floatType", Types.FloatType.get()), + Types.NestedField.required(15, "doubleType", Types.DoubleType.get())); + + private static final List SUPPORTED_PRIMITIVES = + SCHEMA.asStruct().fields().stream().map(Types.NestedField::name).collect(Collectors.toList()); + + private static final Schema NESTED_SCHEMA = + new Schema( + Types.NestedField.required( + 1, + "structType", + Types.StructType.of( + Types.NestedField.optional(2, "innerStringType", Types.StringType.get()), + Types.NestedField.optional(3, "innerIntegerType", Types.IntegerType.get())))); + + @Test + public void testNullPartitionValue() { + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build(); + + List rows = + Lists.newArrayList( + GenericRowData.of(1, StringData.fromString("a")), + GenericRowData.of(2, StringData.fromString("b")), + GenericRowData.of(3, null)); + + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + + for (RowData row : rows) { + PartitionKey partitionKey = new PartitionKey(spec, schema); + partitionKey.partition(rowWrapper.wrap(row)); + assertThat(partitionKey.size()).isEqualTo(1); + + String expectedStr = row.isNullAt(1) ? 
null : row.getString(1).toString(); + assertThat(partitionKey.get(0, String.class)).isEqualTo(expectedStr); + } + } + + @Test + public void testPartitionWithOneNestedField() { + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); + List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1991); + List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); + + PartitionSpec spec1 = + PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerStringType").build(); + PartitionSpec spec2 = + PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerIntegerType").build(); + + for (int i = 0; i < rows.size(); i++) { + RowData row = rows.get(i); + Record record = (Record) records.get(i).get(0); + + PartitionKey partitionKey1 = new PartitionKey(spec1, NESTED_SCHEMA); + partitionKey1.partition(rowWrapper.wrap(row)); + assertThat(partitionKey1.size()).isEqualTo(1); + + assertThat(partitionKey1.get(0, String.class)).isEqualTo(record.get(0)); + + PartitionKey partitionKey2 = new PartitionKey(spec2, NESTED_SCHEMA); + partitionKey2.partition(rowWrapper.wrap(row)); + assertThat(partitionKey2.size()).isEqualTo(1); + + assertThat(partitionKey2.get(0, Integer.class)).isEqualTo(record.get(1)); + } + } + + @Test + public void testPartitionMultipleNestedField() { + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); + List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1992); + List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); + + PartitionSpec spec1 = + PartitionSpec.builderFor(NESTED_SCHEMA) + .identity("structType.innerIntegerType") + .identity("structType.innerStringType") + .build(); + PartitionSpec spec2 = + PartitionSpec.builderFor(NESTED_SCHEMA) + .identity("structType.innerStringType") + .identity("structType.innerIntegerType") + .build(); + + PartitionKey pk1 = new PartitionKey(spec1, NESTED_SCHEMA); + PartitionKey pk2 = new PartitionKey(spec2, NESTED_SCHEMA); + + for (int i = 0; i < rows.size(); i++) { + RowData row = rows.get(i); + Record record = (Record) records.get(i).get(0); + + pk1.partition(rowWrapper.wrap(row)); + assertThat(pk1.size()).isEqualTo(2); + + assertThat(pk1.get(0, Integer.class)).isEqualTo(record.get(1)); + assertThat(pk1.get(1, String.class)).isEqualTo(record.get(0)); + + pk2.partition(rowWrapper.wrap(row)); + assertThat(pk2.size()).isEqualTo(2); + + assertThat(pk2.get(0, String.class)).isEqualTo(record.get(0)); + assertThat(pk2.get(1, Integer.class)).isEqualTo(record.get(1)); + } + } + + @Test + public void testPartitionValueTypes() { + RowType rowType = FlinkSchemaUtil.convert(SCHEMA); + RowDataWrapper rowWrapper = new RowDataWrapper(rowType, SCHEMA.asStruct()); + InternalRecordWrapper recordWrapper = new InternalRecordWrapper(SCHEMA.asStruct()); + + List records = RandomGenericData.generate(SCHEMA, 10, 1993); + List rows = Lists.newArrayList(RandomRowData.convert(SCHEMA, records)); + + for (String column : SUPPORTED_PRIMITIVES) { + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity(column).build(); + Class[] javaClasses = spec.javaClasses(); + + PartitionKey pk = new PartitionKey(spec, SCHEMA); + PartitionKey expectedPK = new PartitionKey(spec, SCHEMA); + + for (int j = 0; j < rows.size(); j++) { + RowData row = rows.get(j); + Record record = records.get(j); + + pk.partition(rowWrapper.wrap(row)); + expectedPK.partition(recordWrapper.wrap(record)); + + 
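+        // Note: timeType is compared at millisecond precision further below because Flink's TIME
+        // type only carries milliseconds, while the generic Iceberg record keeps microseconds.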
assertThat(pk.size()) + .as("Partition with column " + column + " should have one field.") + .isEqualTo(1); + + if (column.equals("timeType")) { + assertThat(pk.get(0, Long.class) / 1000) + .as("Partition with column " + column + " should have the expected values") + .isEqualTo(expectedPK.get(0, Long.class) / 1000); + } else { + assertThat(pk.get(0, javaClasses[0])) + .as("Partition with column " + column + " should have the expected values") + .isEqualTo(expectedPK.get(0, javaClasses[0])); + } + } + } + } + + @Test + public void testNestedPartitionValues() { + Schema nestedSchema = new Schema(Types.NestedField.optional(1001, "nested", SCHEMA.asStruct())); + RowType rowType = FlinkSchemaUtil.convert(nestedSchema); + + RowDataWrapper rowWrapper = new RowDataWrapper(rowType, nestedSchema.asStruct()); + InternalRecordWrapper recordWrapper = new InternalRecordWrapper(nestedSchema.asStruct()); + + List records = RandomGenericData.generate(nestedSchema, 10, 1994); + List rows = Lists.newArrayList(RandomRowData.convert(nestedSchema, records)); + + for (String supportedPrimitive : SUPPORTED_PRIMITIVES) { + String column = String.format("nested.%s", supportedPrimitive); + + PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity(column).build(); + Class[] javaClasses = spec.javaClasses(); + + PartitionKey pk = new PartitionKey(spec, nestedSchema); + PartitionKey expectedPK = new PartitionKey(spec, nestedSchema); + + for (int j = 0; j < rows.size(); j++) { + pk.partition(rowWrapper.wrap(rows.get(j))); + expectedPK.partition(recordWrapper.wrap(records.get(j))); + + assertThat(pk.size()) + .as("Partition with nested column " + column + " should have one field.") + .isEqualTo(1); + + if (column.equals("nested.timeType")) { + assertThat(pk.get(0, Long.class) / 1000) + .as("Partition with nested column " + column + " should have the expected values.") + .isEqualTo(expectedPK.get(0, Long.class) / 1000); + } else { + assertThat(pk.get(0, javaClasses[0])) + .as("Partition with nested column " + column + " should have the expected values.") + .isEqualTo(expectedPK.get(0, javaClasses[0])); + } + } + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java new file mode 100644 index 000000000000..6b7b0d4c35a3 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.List; +import java.util.Map; +import org.apache.flink.table.data.RowData; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.data.RandomRowData; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestTaskWriters { + private static final Configuration CONF = new Configuration(); + private static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; + + @TempDir protected java.nio.file.Path temporaryFolder; + + @Parameters(name = "format = {0}, partitioned = {1}") + public static Object[][] parameters() { + return new Object[][] { + {FileFormat.AVRO, true}, + {FileFormat.AVRO, false}, + {FileFormat.ORC, true}, + {FileFormat.ORC, false}, + {FileFormat.PARQUET, true}, + {FileFormat.PARQUET, false} + }; + } + + @Parameter(index = 0) + private FileFormat format; + + @Parameter(index = 1) + private boolean partitioned; + + private Table table; + + @BeforeEach + public void before() throws IOException { + File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); + // Construct the iceberg table with the specified file format. + Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + table = SimpleDataUtil.createTable(folder.getAbsolutePath(), props, partitioned); + } + + @TestTemplate + public void testWriteZeroRecord() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { + taskWriter.close(); + + DataFile[] dataFiles = taskWriter.dataFiles(); + assertThat(dataFiles).isNotNull().isEmpty(); + + // Close again. + taskWriter.close(); + dataFiles = taskWriter.dataFiles(); + assertThat(dataFiles).isNotNull().isEmpty(); + } + } + + @TestTemplate + public void testCloseTwice() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { + taskWriter.write(SimpleDataUtil.createRowData(1, "hello")); + taskWriter.write(SimpleDataUtil.createRowData(2, "world")); + taskWriter.close(); // The first close + taskWriter.close(); // The second close + + int expectedFiles = partitioned ? 
2 : 1; + DataFile[] dataFiles = taskWriter.dataFiles(); + assertThat(dataFiles).hasSize(expectedFiles); + + FileSystem fs = FileSystem.get(CONF); + for (DataFile dataFile : dataFiles) { + assertThat(fs.exists(new Path(dataFile.location()))).isTrue(); + } + } + } + + @TestTemplate + public void testAbort() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { + taskWriter.write(SimpleDataUtil.createRowData(1, "hello")); + taskWriter.write(SimpleDataUtil.createRowData(2, "world")); + + taskWriter.abort(); + DataFile[] dataFiles = taskWriter.dataFiles(); + + int expectedFiles = partitioned ? 2 : 1; + assertThat(dataFiles).hasSize(expectedFiles); + + FileSystem fs = FileSystem.get(CONF); + for (DataFile dataFile : dataFiles) { + assertThat(fs.exists(new Path(dataFile.location()))).isFalse(); + } + } + } + + @TestTemplate + public void testCompleteFiles() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { + taskWriter.write(SimpleDataUtil.createRowData(1, "a")); + taskWriter.write(SimpleDataUtil.createRowData(2, "b")); + taskWriter.write(SimpleDataUtil.createRowData(3, "c")); + taskWriter.write(SimpleDataUtil.createRowData(4, "d")); + + DataFile[] dataFiles = taskWriter.dataFiles(); + int expectedFiles = partitioned ? 4 : 1; + assertThat(dataFiles).hasSize(expectedFiles); + + dataFiles = taskWriter.dataFiles(); + assertThat(dataFiles).hasSize(expectedFiles); + + FileSystem fs = FileSystem.get(CONF); + for (DataFile dataFile : dataFiles) { + assertThat(fs.exists(new Path(dataFile.location()))).isTrue(); + } + + AppendFiles appendFiles = table.newAppend(); + for (DataFile dataFile : dataFiles) { + appendFiles.appendFile(dataFile); + } + appendFiles.commit(); + + // Assert the data rows. + SimpleDataUtil.assertTableRecords( + table, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "b"), + SimpleDataUtil.createRecord(3, "c"), + SimpleDataUtil.createRecord(4, "d"))); + } + } + + @TestTemplate + public void testRollingWithTargetFileSize() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(4)) { + List rows = Lists.newArrayListWithCapacity(8000); + List records = Lists.newArrayListWithCapacity(8000); + for (int i = 0; i < 2000; i++) { + for (String data : new String[] {"a", "b", "c", "d"}) { + rows.add(SimpleDataUtil.createRowData(i, data)); + records.add(SimpleDataUtil.createRecord(i, data)); + } + } + + for (RowData row : rows) { + taskWriter.write(row); + } + + DataFile[] dataFiles = taskWriter.dataFiles(); + assertThat(dataFiles).hasSize(8); + + AppendFiles appendFiles = table.newAppend(); + for (DataFile dataFile : dataFiles) { + appendFiles.appendFile(dataFile); + } + appendFiles.commit(); + + // Assert the data rows. + SimpleDataUtil.assertTableRecords(table, records); + } + } + + @TestTemplate + public void testRandomData() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { + Iterable rows = RandomRowData.generate(SimpleDataUtil.SCHEMA, 100, 1996); + for (RowData row : rows) { + taskWriter.write(row); + } + + taskWriter.close(); + DataFile[] dataFiles = taskWriter.dataFiles(); + AppendFiles appendFiles = table.newAppend(); + for (DataFile dataFile : dataFiles) { + appendFiles.appendFile(dataFile); + } + appendFiles.commit(); + + // Assert the data rows. 
+ SimpleDataUtil.assertTableRows(table, Lists.newArrayList(rows)); + } + } + + private TaskWriter createTaskWriter(long targetFileSize) { + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + SimpleDataUtil.ROW_TYPE, + targetFileSize, + format, + table.properties(), + null, + false); + taskWriterFactory.initialize(1, 1); + return taskWriterFactory.create(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java new file mode 100644 index 000000000000..30782e8d4170 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.apache.iceberg.types.Types.NestedField.required; + +import java.util.Collections; +import org.apache.flink.api.common.typeutils.SerializerTestBase; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.extension.RegisterExtension; + +/** + * Test base for DynamicRecordInternalSerializer which allows to instantiate different serializer + * version, e.g. with writing the schema itself or just the schema id. 
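+ * Subclasses toggle the {@code writeFullSchemaAndSpec} flag to exercise both modes.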
+ */ +abstract class DynamicRecordInternalSerializerTestBase + extends SerializerTestBase { + + static final String TABLE = "myTable"; + static final String BRANCH = "myBranch"; + + @RegisterExtension + static final HadoopCatalogExtension CATALOG_EXTENSION = new HadoopCatalogExtension("db", TABLE); + + static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required(3, "number", Types.FloatType.get())); + + static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).bucket("id", 10).build(); + + private boolean writeFullSchemaAndSpec; + + DynamicRecordInternalSerializerTestBase(boolean writeFullSchemaAndSpec) { + this.writeFullSchemaAndSpec = writeFullSchemaAndSpec; + } + + @Override + protected TypeSerializer createSerializer() { + return new DynamicRecordInternalSerializer( + new TableSerializerCache(CATALOG_EXTENSION.catalogLoader(), 1), writeFullSchemaAndSpec); + } + + @BeforeEach + void before() { + CATALOG_EXTENSION.catalog().createTable(TableIdentifier.parse(TABLE), SCHEMA, SPEC); + } + + @Override + protected DynamicRecordInternal[] getTestData() { + GenericRowData rowData = new GenericRowData(3); + rowData.setField(0, 123L); + rowData.setField(1, StringData.fromString("test")); + rowData.setField(2, 1.23f); + + return new DynamicRecordInternal[] { + new DynamicRecordInternal( + TABLE, BRANCH, SCHEMA, rowData, SPEC, 42, false, Collections.emptySet()) + }; + } + + @Override + protected Class getTypeClass() { + return DynamicRecordInternal.class; + } + + @Override + protected int getLength() { + return -1; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java new file mode 100644 index 000000000000..385a354889fb --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.ListType; +import org.apache.iceberg.types.Types.LongType; +import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.types.Types.StructType; +import org.junit.jupiter.api.Test; + +class TestCompareSchemasVisitor { + + @Test + void testSchema() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(1, "id", IntegerType.get(), "comment"), + optional(2, "data", StringType.get()), + optional(3, "extra", StringType.get())), + new Schema( + optional(1, "id", IntegerType.get(), "comment"), + optional(2, "data", StringType.get()), + optional(3, "extra", StringType.get())))) + .isEqualTo(CompareSchemasVisitor.Result.SAME); + } + + @Test + void testSchemaDifferentId() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(0, "id", IntegerType.get()), + optional(1, "data", StringType.get()), + optional(2, "extra", StringType.get())), + new Schema( + optional(1, "id", IntegerType.get()), + optional(2, "data", StringType.get()), + optional(3, "extra", StringType.get())))) + .isEqualTo(CompareSchemasVisitor.Result.SAME); + } + + @Test + void testSchemaDifferent() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(0, "id", IntegerType.get()), + optional(1, "data", StringType.get()), + optional(2, "extra", StringType.get())), + new Schema( + optional(0, "id", IntegerType.get()), optional(1, "data", StringType.get())))) + .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); + } + + @Test + void testSchemaWithMoreColumns() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(0, "id", IntegerType.get()), optional(1, "data", StringType.get())), + new Schema( + optional(0, "id", IntegerType.get()), + optional(1, "data", StringType.get()), + optional(2, "extra", StringType.get())))) + .isEqualTo(CompareSchemasVisitor.Result.DATA_CONVERSION_NEEDED); + } + + @Test + void testDifferentType() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(1, "id", LongType.get()), optional(2, "extra", StringType.get())), + new Schema( + optional(1, "id", IntegerType.get()), optional(2, "extra", StringType.get())))) + .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); + } + + @Test + void testCompatibleType() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(1, "id", IntegerType.get()), optional(2, "extra", StringType.get())), + new Schema( + optional(1, "id", LongType.get()), optional(2, "extra", StringType.get())))) + .isEqualTo(CompareSchemasVisitor.Result.DATA_CONVERSION_NEEDED); + } + + @Test + void testRequiredChangeForMatchingField() { + Schema dataSchema = + new Schema(optional(1, "id", IntegerType.get()), optional(2, "extra", StringType.get())); + Schema tableSchema = + new Schema(required(1, "id", IntegerType.get()), optional(2, "extra", StringType.get())); + assertThat(CompareSchemasVisitor.visit(dataSchema, tableSchema)) + .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); + assertThat(CompareSchemasVisitor.visit(tableSchema, dataSchema)) + .isEqualTo(CompareSchemasVisitor.Result.SAME); + } + + @Test + void 
testRequiredChangeForNonMatchingField() { + Schema dataSchema = new Schema(optional(1, "id", IntegerType.get())); + Schema tableSchema = + new Schema(optional(1, "id", IntegerType.get()), required(2, "extra", StringType.get())); + assertThat(CompareSchemasVisitor.visit(dataSchema, tableSchema)) + .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); + assertThat(CompareSchemasVisitor.visit(tableSchema, dataSchema)) + .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); + } + + @Test + void testNoRequiredChangeForNonMatchingField() { + Schema dataSchema = new Schema(required(1, "id", IntegerType.get())); + Schema tableSchema = + new Schema(required(1, "id", IntegerType.get()), optional(2, "extra", StringType.get())); + assertThat(CompareSchemasVisitor.visit(dataSchema, tableSchema)) + .isEqualTo(CompareSchemasVisitor.Result.DATA_CONVERSION_NEEDED); + } + + @Test + void testStructDifferentId() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(1, "id", IntegerType.get()), + optional(2, "struct1", StructType.of(optional(3, "extra", IntegerType.get())))), + new Schema( + optional(0, "id", IntegerType.get()), + optional( + 1, "struct1", StructType.of(optional(2, "extra", IntegerType.get())))))) + .isEqualTo(CompareSchemasVisitor.Result.SAME); + } + + @Test + void testStructChanged() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(0, "id", IntegerType.get()), + optional(1, "struct1", StructType.of(optional(2, "extra", LongType.get())))), + new Schema( + optional(1, "id", IntegerType.get()), + optional( + 2, "struct1", StructType.of(optional(3, "extra", IntegerType.get())))))) + .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); + } + + @Test + void testMapDifferentId() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(1, "id", IntegerType.get()), + optional( + 2, "map1", MapType.ofOptional(3, 4, IntegerType.get(), StringType.get()))), + new Schema( + optional(0, "id", IntegerType.get()), + optional( + 1, "map1", MapType.ofOptional(2, 3, IntegerType.get(), StringType.get()))))) + .isEqualTo(CompareSchemasVisitor.Result.SAME); + } + + @Test + void testMapChanged() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(1, "id", IntegerType.get()), + optional( + 2, "map1", MapType.ofOptional(3, 4, LongType.get(), StringType.get()))), + new Schema( + optional(1, "id", IntegerType.get()), + optional( + 2, "map1", MapType.ofOptional(3, 4, IntegerType.get(), StringType.get()))))) + .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); + } + + @Test + void testListDifferentId() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(1, "id", IntegerType.get()), + optional(2, "list1", ListType.ofOptional(3, IntegerType.get()))), + new Schema( + optional(0, "id", IntegerType.get()), + optional(1, "list1", ListType.ofOptional(2, IntegerType.get()))))) + .isEqualTo(CompareSchemasVisitor.Result.SAME); + } + + @Test + void testListChanged() { + assertThat( + CompareSchemasVisitor.visit( + new Schema( + optional(0, "id", IntegerType.get()), + optional(1, "list1", ListType.ofOptional(2, LongType.get()))), + new Schema( + optional(1, "id", IntegerType.get()), + optional(2, "list1", ListType.ofOptional(3, IntegerType.get()))))) + .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java 
b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java new file mode 100644 index 000000000000..13a06d362717 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import org.apache.flink.api.common.JobID; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.junit.jupiter.api.Test; + +class TestDynamicCommittableSerializer { + + @Test + void testRoundtrip() throws IOException { + DynamicCommittable committable = + new DynamicCommittable( + new WriteTarget("table", "branch", 42, 23, false, Sets.newHashSet(1, 2)), + new byte[] {3, 4}, + JobID.generate().toHexString(), + new OperatorID().toHexString(), + 5); + + DynamicCommittableSerializer serializer = new DynamicCommittableSerializer(); + assertThat(serializer.deserialize(serializer.getVersion(), serializer.serialize(committable))) + .isEqualTo(committable); + } + + @Test + void testUnsupportedVersion() throws IOException { + DynamicCommittable committable = + new DynamicCommittable( + new WriteTarget("table", "branch", 42, 23, false, Sets.newHashSet(1, 2)), + new byte[] {3, 4}, + JobID.generate().toHexString(), + new OperatorID().toHexString(), + 5); + + DynamicCommittableSerializer serializer = new DynamicCommittableSerializer(); + assertThatThrownBy(() -> serializer.deserialize(-1, serializer.serialize(committable))) + .hasMessage("Unrecognized version or corrupt state: -1") + .isInstanceOf(IOException.class); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java new file mode 100644 index 000000000000..99a546536208 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java @@ -0,0 +1,381 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.ByteBuffer; +import java.util.Map; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.connector.sink2.Committer.CommitRequest; +import org.apache.flink.api.connector.sink2.mocks.MockCommitRequest; +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +class TestDynamicCommitter { + + static final String DB = "db"; + static final String TABLE1 = "table"; + static final String TABLE2 = "table2"; + + @RegisterExtension + static final HadoopCatalogExtension CATALOG_EXTENSION = new HadoopCatalogExtension(DB, TABLE1); + + Catalog catalog; + + private static final DataFile DATA_FILE = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withMetrics( + new Metrics( + 42L, + null, // no column sizes + ImmutableMap.of(1, 5L), // value count + ImmutableMap.of(1, 0L), // null count + null, + ImmutableMap.of(1, ByteBuffer.allocate(1)), // lower bounds + ImmutableMap.of(1, ByteBuffer.allocate(1)) // upper bounds + )) + .build(); + + @BeforeEach + void before() { + catalog = CATALOG_EXTENSION.catalog(); + Schema schema1 = new Schema(42); + Schema schema2 = new Schema(43); + catalog.createTable(TableIdentifier.of(TABLE1), schema1); + catalog.createTable(TableIdentifier.of(TABLE2), schema2); + } + + @Test + void testCommit() throws Exception { + Table table1 = catalog.loadTable(TableIdentifier.of(TABLE1)); + assertThat(table1.snapshots()).isEmpty(); + Table table2 = catalog.loadTable(TableIdentifier.of(TABLE2)); + assertThat(table2.snapshots()).isEmpty(); + + boolean overwriteMode = false; + int workerPoolSize = 1; + String sinkId = "sinkId"; + UnregisteredMetricsGroup metricGroup = new UnregisteredMetricsGroup(); + DynamicCommitterMetrics committerMetrics = new DynamicCommitterMetrics(metricGroup); + DynamicCommitter dynamicCommitter = + new DynamicCommitter( + CATALOG_EXTENSION.catalog(), + Maps.newHashMap(), + overwriteMode, + workerPoolSize, + sinkId, + 
committerMetrics); + + WriteTarget writeTarget1 = + new WriteTarget(TABLE1, "branch", 42, 0, true, Sets.newHashSet(1, 2)); + WriteTarget writeTarget2 = + new WriteTarget(TABLE1, "branch2", 43, 0, true, Sets.newHashSet(1, 2)); + WriteTarget writeTarget3 = + new WriteTarget(TABLE2, "branch2", 43, 0, true, Sets.newHashSet(1, 2)); + + DynamicWriteResultAggregator aggregator = + new DynamicWriteResultAggregator(CATALOG_EXTENSION.catalogLoader()); + OneInputStreamOperatorTestHarness aggregatorHarness = + new OneInputStreamOperatorTestHarness(aggregator); + aggregatorHarness.open(); + + byte[] deltaManifest1 = + aggregator.writeToManifest( + writeTarget1, + Sets.newHashSet( + new DynamicWriteResult( + writeTarget1, WriteResult.builder().addDataFiles(DATA_FILE).build())), + 0); + byte[] deltaManifest2 = + aggregator.writeToManifest( + writeTarget2, + Sets.newHashSet( + new DynamicWriteResult( + writeTarget2, WriteResult.builder().addDataFiles(DATA_FILE).build())), + 0); + byte[] deltaManifest3 = + aggregator.writeToManifest( + writeTarget3, + Sets.newHashSet( + new DynamicWriteResult( + writeTarget3, WriteResult.builder().addDataFiles(DATA_FILE).build())), + 0); + + final String jobId = JobID.generate().toHexString(); + final String operatorId = new OperatorID().toHexString(); + final int checkpointId = 10; + + CommitRequest commitRequest1 = + new MockCommitRequest<>( + new DynamicCommittable(writeTarget1, deltaManifest1, jobId, operatorId, checkpointId)); + + CommitRequest commitRequest2 = + new MockCommitRequest<>( + new DynamicCommittable(writeTarget2, deltaManifest2, jobId, operatorId, checkpointId)); + + CommitRequest commitRequest3 = + new MockCommitRequest<>( + new DynamicCommittable(writeTarget3, deltaManifest3, jobId, operatorId, checkpointId)); + + dynamicCommitter.commit(Sets.newHashSet(commitRequest1, commitRequest2, commitRequest3)); + + table1.refresh(); + assertThat(table1.snapshots()).hasSize(2); + Snapshot first = Iterables.getFirst(table1.snapshots(), null); + assertThat(first.summary()) + .containsAllEntriesOf( + (Map) + ImmutableMap.builder() + .put("added-data-files", "1") + .put("added-records", "42") + .put("changed-partition-count", "1") + .put("flink.job-id", jobId) + .put("flink.max-committed-checkpoint-id", "" + checkpointId) + .put("flink.operator-id", operatorId) + .put("total-data-files", "1") + .put("total-delete-files", "0") + .put("total-equality-deletes", "0") + .put("total-files-size", "0") + .put("total-position-deletes", "0") + .put("total-records", "42") + .build()); + Snapshot second = Iterables.get(table1.snapshots(), 1, null); + assertThat(second.summary()) + .containsAllEntriesOf( + (Map) + ImmutableMap.builder() + .put("added-data-files", "1") + .put("added-records", "42") + .put("changed-partition-count", "1") + .put("flink.job-id", jobId) + .put("flink.max-committed-checkpoint-id", "" + checkpointId) + .put("flink.operator-id", operatorId) + .put("total-data-files", "1") + .put("total-delete-files", "0") + .put("total-equality-deletes", "0") + .put("total-files-size", "0") + .put("total-position-deletes", "0") + .put("total-records", "42") + .build()); + + table2.refresh(); + assertThat(table2.snapshots()).hasSize(1); + Snapshot third = Iterables.getFirst(table2.snapshots(), null); + assertThat(third.summary()) + .containsAllEntriesOf( + (Map) + ImmutableMap.builder() + .put("added-data-files", "1") + .put("added-records", "42") + .put("changed-partition-count", "1") + .put("flink.job-id", jobId) + .put("flink.max-committed-checkpoint-id", "" + 
checkpointId) + .put("flink.operator-id", operatorId) + .put("total-data-files", "1") + .put("total-delete-files", "0") + .put("total-equality-deletes", "0") + .put("total-files-size", "0") + .put("total-position-deletes", "0") + .put("total-records", "42") + .build()); + } + + @Test + void testAlreadyCommitted() throws Exception { + Table table1 = catalog.loadTable(TableIdentifier.of(TABLE1)); + assertThat(table1.snapshots()).isEmpty(); + + boolean overwriteMode = false; + int workerPoolSize = 1; + String sinkId = "sinkId"; + UnregisteredMetricsGroup metricGroup = new UnregisteredMetricsGroup(); + DynamicCommitterMetrics committerMetrics = new DynamicCommitterMetrics(metricGroup); + DynamicCommitter dynamicCommitter = + new DynamicCommitter( + CATALOG_EXTENSION.catalog(), + Maps.newHashMap(), + overwriteMode, + workerPoolSize, + sinkId, + committerMetrics); + + WriteTarget writeTarget = + new WriteTarget(TABLE1, "branch", 42, 0, false, Sets.newHashSet(1, 2)); + + DynamicWriteResultAggregator aggregator = + new DynamicWriteResultAggregator(CATALOG_EXTENSION.catalogLoader()); + OneInputStreamOperatorTestHarness aggregatorHarness = + new OneInputStreamOperatorTestHarness(aggregator); + aggregatorHarness.open(); + + final String jobId = JobID.generate().toHexString(); + final String operatorId = new OperatorID().toHexString(); + final int checkpointId = 10; + + byte[] deltaManifest = + aggregator.writeToManifest( + writeTarget, + Sets.newHashSet( + new DynamicWriteResult( + writeTarget, WriteResult.builder().addDataFiles(DATA_FILE).build())), + checkpointId); + + CommitRequest commitRequest = + new MockCommitRequest<>( + new DynamicCommittable(writeTarget, deltaManifest, jobId, operatorId, checkpointId)); + + dynamicCommitter.commit(Sets.newHashSet(commitRequest)); + + CommitRequest oldCommitRequest = + new MockCommitRequest<>( + new DynamicCommittable( + writeTarget, deltaManifest, jobId, operatorId, checkpointId - 1)); + + // Old commits requests shouldn't affect the result + dynamicCommitter.commit(Sets.newHashSet(oldCommitRequest)); + + table1.refresh(); + assertThat(table1.snapshots()).hasSize(1); + Snapshot first = Iterables.getFirst(table1.snapshots(), null); + assertThat(first.summary()) + .containsAllEntriesOf( + (Map) + ImmutableMap.builder() + .put("added-data-files", "1") + .put("added-records", "42") + .put("changed-partition-count", "1") + .put("flink.job-id", jobId) + .put("flink.max-committed-checkpoint-id", "" + checkpointId) + .put("flink.operator-id", operatorId) + .put("total-data-files", "1") + .put("total-delete-files", "0") + .put("total-equality-deletes", "0") + .put("total-files-size", "0") + .put("total-position-deletes", "0") + .put("total-records", "42") + .build()); + } + + @Test + void testReplacePartitions() throws Exception { + Table table1 = catalog.loadTable(TableIdentifier.of(TABLE1)); + assertThat(table1.snapshots()).isEmpty(); + + // Overwrite mode is active + boolean overwriteMode = true; + int workerPoolSize = 1; + String sinkId = "sinkId"; + UnregisteredMetricsGroup metricGroup = new UnregisteredMetricsGroup(); + DynamicCommitterMetrics committerMetrics = new DynamicCommitterMetrics(metricGroup); + DynamicCommitter dynamicCommitter = + new DynamicCommitter( + CATALOG_EXTENSION.catalog(), + Maps.newHashMap(), + overwriteMode, + workerPoolSize, + sinkId, + committerMetrics); + + WriteTarget writeTarget = + new WriteTarget(TABLE1, "branch", 42, 0, false, Sets.newHashSet(1, 2)); + + DynamicWriteResultAggregator aggregator = + new 
DynamicWriteResultAggregator(CATALOG_EXTENSION.catalogLoader()); + OneInputStreamOperatorTestHarness aggregatorHarness = + new OneInputStreamOperatorTestHarness(aggregator); + aggregatorHarness.open(); + + final String jobId = JobID.generate().toHexString(); + final String operatorId = new OperatorID().toHexString(); + final int checkpointId = 10; + + byte[] deltaManifest = + aggregator.writeToManifest( + writeTarget, + Sets.newHashSet( + new DynamicWriteResult( + writeTarget, WriteResult.builder().addDataFiles(DATA_FILE).build())), + checkpointId); + + CommitRequest commitRequest = + new MockCommitRequest<>( + new DynamicCommittable(writeTarget, deltaManifest, jobId, operatorId, checkpointId)); + + dynamicCommitter.commit(Sets.newHashSet(commitRequest)); + + byte[] overwriteManifest = + aggregator.writeToManifest( + writeTarget, + Sets.newHashSet( + new DynamicWriteResult( + writeTarget, WriteResult.builder().addDataFiles(DATA_FILE).build())), + checkpointId + 1); + + CommitRequest overwriteRequest = + new MockCommitRequest<>( + new DynamicCommittable( + writeTarget, overwriteManifest, jobId, operatorId, checkpointId + 1)); + + dynamicCommitter.commit(Sets.newHashSet(overwriteRequest)); + + table1.refresh(); + assertThat(table1.snapshots()).hasSize(2); + Snapshot latestSnapshot = Iterables.getLast(table1.snapshots()); + assertThat(latestSnapshot.summary()) + .containsAllEntriesOf( + (Map) + ImmutableMap.builder() + .put("replace-partitions", "true") + .put("added-data-files", "1") + .put("added-records", "42") + .put("changed-partition-count", "1") + .put("flink.job-id", jobId) + .put("flink.max-committed-checkpoint-id", String.valueOf(checkpointId + 1)) + .put("flink.operator-id", operatorId) + .put("total-data-files", "1") + .put("total-delete-files", "0") + .put("total-equality-deletes", "0") + .put("total-files-size", "0") + .put("total-position-deletes", "0") + .put("total-records", "42") + .build()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java new file mode 100644 index 000000000000..b61e297cc140 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -0,0 +1,850 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.fail; + +import java.io.IOException; +import java.io.Serializable; +import java.time.Duration; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import org.apache.flink.api.common.typeinfo.TypeHint; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.sink2.Committer; +import org.apache.flink.api.connector.sink2.CommitterInitContext; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.RestartStrategyOptions; +import org.apache.flink.runtime.client.JobExecutionException; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.flink.util.Collector; +import org.apache.flink.util.ExceptionUtils; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SnapshotUpdate; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.sink.CommitSummary; +import org.apache.iceberg.flink.sink.TestFlinkIcebergSinkBase; +import org.apache.iceberg.inmemory.InMemoryInputFile; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +class TestDynamicIcebergSink extends TestFlinkIcebergSinkBase { + + private static long seed; + + @BeforeEach + void before() { + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(2); + seed = 0; + } + + private static class DynamicIcebergDataImpl implements Serializable { + Row rowProvided; + Row rowExpected; + Schema schemaProvided; + Schema schemaExpected; + String tableName; + String branch; + PartitionSpec partitionSpec; + boolean 
upsertMode; + Set equalityFields; + + private DynamicIcebergDataImpl( + Schema schemaProvided, String tableName, String branch, PartitionSpec partitionSpec) { + this( + schemaProvided, + schemaProvided, + tableName, + branch, + partitionSpec, + false, + Collections.emptySet(), + false); + } + + private DynamicIcebergDataImpl( + Schema schemaProvided, + Schema schemaExpected, + String tableName, + String branch, + PartitionSpec partitionSpec) { + this( + schemaProvided, + schemaExpected, + tableName, + branch, + partitionSpec, + false, + Collections.emptySet(), + false); + } + + private DynamicIcebergDataImpl( + Schema schemaProvided, + String tableName, + String branch, + PartitionSpec partitionSpec, + boolean upsertMode, + Set equalityFields, + boolean isDuplicate) { + this( + schemaProvided, + schemaProvided, + tableName, + branch, + partitionSpec, + upsertMode, + equalityFields, + isDuplicate); + } + + private DynamicIcebergDataImpl( + Schema schemaProvided, + Schema schemaExpected, + String tableName, + String branch, + PartitionSpec partitionSpec, + boolean upsertMode, + Set equalityFields, + boolean isDuplicate) { + this.rowProvided = randomRow(schemaProvided, isDuplicate ? seed : ++seed); + this.rowExpected = isDuplicate ? null : rowProvided; + this.schemaProvided = schemaProvided; + this.schemaExpected = schemaExpected; + this.tableName = tableName; + this.branch = branch; + this.partitionSpec = partitionSpec; + this.upsertMode = upsertMode; + this.equalityFields = equalityFields; + } + } + + private static class Generator implements DynamicRecordGenerator { + + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + String branch = row.branch; + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec, + spec.isPartitioned() ? 
DistributionMode.HASH : DistributionMode.NONE, + 10); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + + private static DataFormatConverters.RowConverter converter(Schema schema) { + RowType rowType = FlinkSchemaUtil.convert(schema); + ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(rowType); + return new DataFormatConverters.RowConverter( + resolvedSchema.getColumnDataTypes().toArray(DataType[]::new)); + } + + @Test + void testWrite() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned())); + + runTest(rows); + } + + @Test + void testWritePartitioned() throws Exception { + PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); + + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec), + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec), + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec), + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec), + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec)); + + runTest(rows); + } + + @Test + void testWritePartitionedAdjustSchemaIdsInSpec() throws Exception { + Schema schema = + new Schema( + // Use zero-based schema field ids + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required(1, "data", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("id", 10).build(); + Schema schema2 = + new Schema( + // Use zero-based schema field ids + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required(1, "data", Types.StringType.get()), + Types.NestedField.optional(2, "extra", Types.StringType.get())); + PartitionSpec spec2 = PartitionSpec.builderFor(schema2).bucket("extra", 23).build(); + + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl(schema, "t1", "main", spec), + new DynamicIcebergDataImpl(schema, "t1", "main", spec), + new DynamicIcebergDataImpl(schema, "t1", "main", spec), + new DynamicIcebergDataImpl(schema2, "t1", "main", spec2), + new DynamicIcebergDataImpl(schema2, "t1", "main", spec2)); + + runTest(rows); + } + + @Test + void testSchemaEvolutionFieldOrderChanges() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required(1, "data", Types.StringType.get())); + Schema expectedSchema = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + Schema schema2 = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required(1, "extra", Types.StringType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + Schema expectedSchema2 = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(3, "extra", Types.StringType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + schema, expectedSchema, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + schema, expectedSchema, 
"t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + schema, expectedSchema, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + schema2, expectedSchema2, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + schema2, expectedSchema2, "t1", "main", PartitionSpec.unpartitioned())); + + for (DynamicIcebergDataImpl row : rows) { + if (row.schemaExpected == expectedSchema) { + // We manually adjust the expected Row to match the second expected schema + row.rowExpected = Row.of(row.rowProvided.getField(0), null, row.rowProvided.getField(1)); + } + } + + runTest(rows); + } + + @Test + void testMultipleTables() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t2", "main", PartitionSpec.unpartitioned())); + + runTest(rows); + } + + @Test + void testMultipleTablesPartitioned() throws Exception { + PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); + + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec), + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t2", "main", spec)); + + runTest(rows); + } + + @Test + void testSchemaEvolutionAddField() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA2, "t1", "main", PartitionSpec.unpartitioned())); + + runTest(rows, this.env, 1); + } + + @Test + void testRowEvolutionNullMissingOptionalField() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA2, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned())); + + runTest(rows, this.env, 1); + } + + @Test + void testRowEvolutionMakeMissingRequiredFieldOptional() throws Exception { + Schema existingSchemaWithRequiredField = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + CATALOG_EXTENSION + .catalog() + .createTable(TableIdentifier.of(DATABASE, "t1"), existingSchemaWithRequiredField); + + Schema writeSchemaWithoutRequiredField = + new Schema(Types.NestedField.optional(1, "id", Types.IntegerType.get())); + + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + writeSchemaWithoutRequiredField, + existingSchemaWithRequiredField, + "t1", + "main", + PartitionSpec.unpartitioned())); + + runTest(rows, this.env, 1); + } + + @Test + void testSchemaEvolutionNonBackwardsCompatible() throws Exception { + Schema initialSchema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + // Type change is not allowed + Schema erroringSchema = new Schema(Types.NestedField.required(1, "id", Types.StringType.get())); + + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl(initialSchema, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + erroringSchema, "t1", "main", PartitionSpec.unpartitioned())); + + try { + runTest(rows, StreamExecutionEnvironment.getExecutionEnvironment(), 1); + fail(); + } catch (JobExecutionException e) { + assertThat( + ExceptionUtils.findThrowable( + e, t -> t.getMessage().contains("Cannot change column type: 
id: int -> string"))) + .isNotEmpty(); + } + } + + @Test + void testPartitionSpecEvolution() throws Exception { + PartitionSpec spec1 = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); + PartitionSpec spec2 = + PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 5).identity("data").build(); + + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec1), + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec2), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec1), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec2), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec1), + new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec2)); + + runTest(rows); + } + + @Test + void testMultipleBranches() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "branch1", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned())); + + runTest(rows); + } + + @Test + void testWriteMultipleTablesWithSchemaChanges() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t2", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA2, "t2", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t2", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA2, "t2", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned())); + + runTest(rows); + } + + @Test + void testUpsert() throws Exception { + List rows = + Lists.newArrayList( + // Insert one rows + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + "main", + PartitionSpec.unpartitioned(), + true, + Sets.newHashSet("id"), + false), + // Remaining rows are duplicates + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + "main", + PartitionSpec.unpartitioned(), + true, + Sets.newHashSet("id"), + true), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + "main", + PartitionSpec.unpartitioned(), + true, + Sets.newHashSet("id"), + true), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + "main", + PartitionSpec.unpartitioned(), + true, + Sets.newHashSet("id"), + true)); + + executeDynamicSink(rows, env, true, 1, null); + + try (CloseableIterable iterable = + IcebergGenerics.read( + CATALOG_EXTENSION.catalog().loadTable(TableIdentifier.of("default", "t1"))) + .build()) { + List records = Lists.newArrayList(); + for (Record record : iterable) { + records.add(record); + } + + 
assertThat(records).hasSize(1); + Record actual = records.get(0); + DynamicIcebergDataImpl input = rows.get(0); + assertThat(actual.get(0)).isEqualTo(input.rowProvided.getField(0)); + assertThat(actual.get(1)).isEqualTo(input.rowProvided.getField(1)); + // There is an additional _pos field which gets added + } + } + + @Test + void testCommitFailedBeforeOrAfterCommit() throws Exception { + // Configure a Restart strategy to allow recovery + Configuration configuration = new Configuration(); + configuration.set(RestartStrategyOptions.RESTART_STRATEGY, "fixed-delay"); + configuration.set(RestartStrategyOptions.RESTART_STRATEGY_FIXED_DELAY_ATTEMPTS, 2); + configuration.set(RestartStrategyOptions.RESTART_STRATEGY_FIXED_DELAY_DELAY, Duration.ZERO); + env.configure(configuration); + + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t2", "main", PartitionSpec.unpartitioned())); + + FailBeforeAndAfterCommit.reset(); + final CommitHook commitHook = new FailBeforeAndAfterCommit(); + assertThat(FailBeforeAndAfterCommit.failedBeforeCommit).isFalse(); + assertThat(FailBeforeAndAfterCommit.failedAfterCommit).isFalse(); + + executeDynamicSink(rows, env, true, 1, commitHook); + + assertThat(FailBeforeAndAfterCommit.failedBeforeCommit).isTrue(); + assertThat(FailBeforeAndAfterCommit.failedAfterCommit).isTrue(); + } + + @Test + void testCommitConcurrency() throws Exception { + + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t2", "main", PartitionSpec.unpartitioned())); + + TableIdentifier tableIdentifier = TableIdentifier.of("default", "t1"); + Catalog catalog = CATALOG_EXTENSION.catalog(); + catalog.createTable(tableIdentifier, new Schema()); + + final CommitHook commitHook = new AppendRightBeforeCommit(tableIdentifier.toString()); + + executeDynamicSink(rows, env, true, 1, commitHook); + } + + interface CommitHook extends Serializable { + void beforeCommit(); + + void duringCommit(); + + void afterCommit(); + } + + private static class FailBeforeAndAfterCommit implements CommitHook { + + static boolean failedBeforeCommit; + static boolean failedAfterCommit; + + @Override + public void beforeCommit() { + if (!failedBeforeCommit) { + failedBeforeCommit = true; + throw new RuntimeException("Failing before commit"); + } + } + + @Override + public void duringCommit() {} + + @Override + public void afterCommit() { + if (!failedAfterCommit) { + failedAfterCommit = true; + throw new RuntimeException("Failing before commit"); + } + } + + static void reset() { + failedBeforeCommit = false; + failedAfterCommit = false; + } + } + + private static class AppendRightBeforeCommit implements CommitHook { + + final String tableIdentifier; + + private AppendRightBeforeCommit(String tableIdentifier) { + this.tableIdentifier = tableIdentifier; + } + + @Override + public void beforeCommit() {} + + @Override + public void duringCommit() { + // Create a conflict + Table table = CATALOG_EXTENSION.catalog().loadTable(TableIdentifier.parse(tableIdentifier)); + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withInputFile(new InMemoryInputFile(new byte[] {1, 2, 3})) + .withFormat(FileFormat.AVRO) + .withRecordCount(3) + .build(); + table.newAppend().appendFile(dataFile).commit(); + } + + @Override + public void afterCommit() 
{} + } + + private void runTest(List dynamicData) throws Exception { + runTest(dynamicData, this.env, 2); + } + + private void runTest( + List dynamicData, StreamExecutionEnvironment env, int parallelism) + throws Exception { + runTest(dynamicData, env, true, parallelism); + runTest(dynamicData, env, false, parallelism); + } + + private void runTest( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism) + throws Exception { + executeDynamicSink(dynamicData, env, immediateUpdate, parallelism, null); + verifyResults(dynamicData); + } + + private void executeDynamicSink( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + @Nullable CommitHook commitHook) + throws Exception { + DataStream dataStream = + env.addSource(createBoundedSource(dynamicData), TypeInformation.of(new TypeHint<>() {})); + env.setParallelism(parallelism); + + if (commitHook != null) { + new CommitHookEnabledDynamicIcebergSink(commitHook) + .forInput(dataStream) + .generator(new Generator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(parallelism) + .immediateTableUpdate(immediateUpdate) + .setSnapshotProperty("commit.retry.num-retries", "0") + .append(); + } else { + DynamicIcebergSink.forInput(dataStream) + .generator(new Generator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(parallelism) + .immediateTableUpdate(immediateUpdate) + .append(); + } + + // Write the data + env.execute("Test Iceberg DataStream"); + } + + static class CommitHookEnabledDynamicIcebergSink extends DynamicIcebergSink.Builder { + private final CommitHook commitHook; + + CommitHookEnabledDynamicIcebergSink(CommitHook commitHook) { + this.commitHook = commitHook; + } + + @Override + DynamicIcebergSink instantiateSink( + Map writeProperties, FlinkWriteConf flinkWriteConf) { + return new CommitHookDynamicIcebergSink( + commitHook, + CATALOG_EXTENSION.catalogLoader(), + Collections.emptyMap(), + "uidPrefix", + writeProperties, + flinkWriteConf, + 100); + } + } + + static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { + + private final CommitHook commitHook; + + CommitHookDynamicIcebergSink( + CommitHook commitHook, + CatalogLoader catalogLoader, + Map snapshotProperties, + String uidPrefix, + Map writeProperties, + FlinkWriteConf flinkWriteConf, + int cacheMaximumSize) { + super( + catalogLoader, + snapshotProperties, + uidPrefix, + writeProperties, + flinkWriteConf, + cacheMaximumSize); + this.commitHook = commitHook; + } + + @Override + public Committer createCommitter(CommitterInitContext context) { + // return super.createCommitter(context); + return new CommitHookEnabledDynamicCommitter( + commitHook, + CATALOG_EXTENSION.catalogLoader().loadCatalog(), + Collections.emptyMap(), + false, + 10, + "sinkId", + new DynamicCommitterMetrics(context.metricGroup())); + } + } + + static class CommitHookEnabledDynamicCommitter extends DynamicCommitter { + private final CommitHook commitHook; + + CommitHookEnabledDynamicCommitter( + CommitHook commitHook, + Catalog catalog, + Map snapshotProperties, + boolean replacePartitions, + int workerPoolSize, + String sinkId, + DynamicCommitterMetrics committerMetrics) { + super( + catalog, snapshotProperties, replacePartitions, workerPoolSize, sinkId, committerMetrics); + this.commitHook = commitHook; + } + + @Override + public void commit(Collection> commitRequests) + throws IOException, InterruptedException { + commitHook.beforeCommit(); + 
super.commit(commitRequests); + commitHook.afterCommit(); + } + + @Override + void commitOperation( + Table table, + String branch, + SnapshotUpdate operation, + CommitSummary summary, + String description, + String newFlinkJobId, + String operatorId, + long checkpointId) { + commitHook.duringCommit(); + super.commitOperation( + table, branch, operation, summary, description, newFlinkJobId, operatorId, checkpointId); + } + } + + private void verifyResults(List dynamicData) throws IOException { + // Calculate the expected result + Map, List> expectedData = Maps.newHashMap(); + Map expectedSchema = Maps.newHashMap(); + dynamicData.forEach( + r -> { + Schema oldSchema = expectedSchema.get(r.tableName); + if (oldSchema == null || oldSchema.columns().size() < r.schemaProvided.columns().size()) { + expectedSchema.put(r.tableName, r.schemaExpected); + } + }); + + dynamicData.forEach( + r -> { + List data = + expectedData.computeIfAbsent( + Tuple2.of(r.tableName, r.branch), unused -> Lists.newArrayList()); + data.addAll( + convertToRowData(expectedSchema.get(r.tableName), ImmutableList.of(r.rowExpected))); + }); + + // Check the expected result + int count = dynamicData.size(); + for (Map.Entry, List> e : expectedData.entrySet()) { + SimpleDataUtil.assertTableRows( + CATALOG_EXTENSION + .catalogLoader() + .loadCatalog() + .loadTable(TableIdentifier.of(DATABASE, e.getKey().f0)), + e.getValue(), + e.getKey().f1); + count -= e.getValue().size(); + } + + // Found every record + assertThat(count).isZero(); + } + + private List convertToRowData(Schema schema, List rows) { + DataFormatConverters.RowConverter converter = converter(schema); + return rows.stream() + .map( + r -> { + Row updateRow = r; + // We need conversion to generate the missing columns + if (r.getArity() != schema.columns().size()) { + updateRow = new Row(schema.columns().size()); + for (int i = 0; i < r.getArity(); ++i) { + updateRow.setField(i, r.getField(i)); + } + } + return converter.toInternal(updateRow); + }) + .collect(Collectors.toList()); + } + + private static Row randomRow(Schema schema, long seedOverride) { + return TestHelpers.convertRecordToRow( + RandomGenericData.generate(schema, 1, seedOverride), schema) + .get(0); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java new file mode 100644 index 000000000000..ae5b2f67120b --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.apache.iceberg.flink.TestFixtures.TABLE; +import static org.apache.iceberg.flink.sink.dynamic.DynamicCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; + +import java.util.List; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.flink.api.common.functions.FlatMapFunction; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.util.Collector; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.RowDataConverter; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.sink.IcebergSink; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Performance test class to compare {@link DynamicIcebergSink} against {@link IcebergSink} to + * measure and compare their throughput. + * + *

<p>The test dynamically generates input for multiple tables, then writes to these tables. For the + * DynamicSink, a single sink is used to write all tables. For the IcebergSink, one sink is used per + * table. The test logs the written record counts and elapsed time based on the Iceberg snapshot + * metadata. + * + * <p>Usage: + * + * <ul> + *   <li>Set the SAMPLE_SIZE, RECORD_SIZE, and TABLE_NUM. + *   <li>Run the unit tests and review logs for performance results. + * </ul> + * + * <p>
    Note: This test is disabled by default and should be enabled manually when performance testing + * is needed. It is not intended as a standard unit test. + */ +@Disabled("Please enable manually for performance testing.") +class TestDynamicIcebergSinkPerf { + private static final Logger LOG = LoggerFactory.getLogger(TestDynamicIcebergSinkPerf.class); + + @RegisterExtension + protected static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TABLE); + + private static final int SAMPLE_SIZE = 50_000; + private static final int RECORD_SIZE = 5_000_000; + private static final int TABLE_NUM = 3; + private static final int PARALLELISM = 2; + private static final int WRITE_PARALLELISM = 2; + private static final TableIdentifier[] IDENTIFIERS = new TableIdentifier[TABLE_NUM]; + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "name2", Types.StringType.get()), + Types.NestedField.required(3, "name3", Types.StringType.get()), + Types.NestedField.required(4, "name4", Types.StringType.get()), + Types.NestedField.required(5, "name5", Types.StringType.get()), + Types.NestedField.required(6, "name6", Types.StringType.get()), + Types.NestedField.required(7, "name7", Types.StringType.get()), + Types.NestedField.required(8, "name8", Types.StringType.get()), + Types.NestedField.required(9, "name9", Types.StringType.get())); + private static final List RANGE = + IntStream.range(0, RECORD_SIZE).boxed().collect(Collectors.toList()); + + private static List rows; + private StreamExecutionEnvironment env; + + @BeforeEach + void before() { + for (int i = 0; i < TABLE_NUM; ++i) { + // So the table name hash difference is bigger than 1 + IDENTIFIERS[i] = TableIdentifier.of(DATABASE, TABLE + "_" + (i * 13)); + + Table table = + CATALOG_EXTENSION + .catalog() + .createTable( + IDENTIFIERS[i], + SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of(MAX_CONTINUOUS_EMPTY_COMMITS, "100000")); + + table.manageSnapshots().createBranch("main").commit(); + } + + List records = RandomGenericData.generate(SCHEMA, SAMPLE_SIZE, 1L); + rows = Lists.newArrayListWithCapacity(records.size()); + for (int i = 0; i < records.size(); ++i) { + rows.add( + new DynamicRecord( + IDENTIFIERS[i % TABLE_NUM], + "main", + SCHEMA, + RowDataConverter.convert(SCHEMA, records.get(i)), + PartitionSpec.unpartitioned(), + DistributionMode.NONE, + WRITE_PARALLELISM)); + } + + Configuration configuration = MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; + configuration.setString("rest.flamegraph.enabled", "true"); + env = + StreamExecutionEnvironment.getExecutionEnvironment(configuration) + .enableCheckpointing(100) + .setParallelism(PARALLELISM) + .setMaxParallelism(PARALLELISM); + env.getConfig().enableObjectReuse(); + } + + @AfterEach + void after() { + for (TableIdentifier identifier : IDENTIFIERS) { + CATALOG_EXTENSION.catalog().dropTable(identifier); + } + } + + private static class IdBasedGenerator implements DynamicRecordGenerator { + + @Override + public void generate(Integer id, Collector out) { + out.collect(rows.get(id % SAMPLE_SIZE)); + } + } + + @Test + void testDynamicSink() throws Exception { + // So we make sure that the writer threads are the same for the 2 tests + env.setMaxParallelism(PARALLELISM * TABLE_NUM * 2); + env.setParallelism(PARALLELISM * TABLE_NUM * 2); + runTest( + s -> { + DynamicIcebergSink.forInput(s) + .generator(new IdBasedGenerator()) + .immediateTableUpdate(true) 
+ .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .append(); + }); + } + + @Test + void testIcebergSink() throws Exception { + runTest( + s -> { + for (int i = 0; i < IDENTIFIERS.length; ++i) { + TableLoader tableLoader = + TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), IDENTIFIERS[i]); + final int finalInt = i; + IcebergSink.forRowData( + s.flatMap( + (FlatMapFunction) + (input, collector) -> { + if (input % TABLE_NUM == finalInt) { + collector.collect(rows.get(input % SAMPLE_SIZE).rowData()); + } + }) + .returns(InternalTypeInfo.of(FlinkSchemaUtil.convert(SCHEMA))) + .rebalance()) + .tableLoader(tableLoader) + .uidSuffix("Uid" + i) + .writeParallelism(WRITE_PARALLELISM) + .append(); + } + }); + } + + private void runTest(Consumer> sink) throws Exception { + DataStream dataStream = + env.addSource( + new BoundedTestSource<>( + ImmutableList.of( + RANGE, RANGE, RANGE, RANGE, RANGE, RANGE, RANGE, RANGE, RANGE, RANGE), + true), + TypeInformation.of(Integer.class)); + + sink.accept(dataStream); + + long before = System.currentTimeMillis(); + env.execute(); + + for (TableIdentifier identifier : IDENTIFIERS) { + Table table = CATALOG_EXTENSION.catalog().loadTable(identifier); + for (Snapshot snapshot : table.snapshots()) { + long records = 0; + for (DataFile dataFile : snapshot.addedDataFiles(table.io())) { + records += dataFile.recordCount(); + } + + LOG.info( + "TEST RESULT: For table {} snapshot {} written {} records in {} ms", + identifier, + snapshot.snapshotId(), + records, + snapshot.timestampMillis() - before); + before = snapshot.timestampMillis(); + } + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java new file mode 100644 index 000000000000..ab8ce98c3594 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +/** Test writing DynamicRecord with the full schema */ +class TestDynamicRecordInternalSerializerWriteSchema + extends DynamicRecordInternalSerializerTestBase { + + TestDynamicRecordInternalSerializerWriteSchema() { + super(true); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java new file mode 100644 index 000000000000..1d8890546214 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +/** Test writing DynamicRecord with only the schema id. */ +class TestDynamicRecordInternalSerializerWriteSchemaId + extends DynamicRecordInternalSerializerTestBase { + + TestDynamicRecordInternalSerializerWriteSchemaId() { + super(false); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java new file mode 100644 index 000000000000..618074f412f9 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.apache.iceberg.flink.TestFixtures.TABLE; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import org.apache.flink.table.data.GenericRowData; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +class TestDynamicTableUpdateOperator { + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TABLE); + + private static final Schema SCHEMA1 = + new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + + private static final Schema SCHEMA2 = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + @Test + void testDynamicTableUpdateOperatorNewTable() throws Exception { + int cacheMaximumSize = 10; + int cacheRefreshMs = 1000; + int inputSchemaCacheMaximumSize = 10; + Catalog catalog = CATALOG_EXTENSION.catalog(); + TableIdentifier table = TableIdentifier.of(TABLE); + + assertThat(catalog.tableExists(table)).isFalse(); + DynamicTableUpdateOperator operator = + new DynamicTableUpdateOperator( + CATALOG_EXTENSION.catalogLoader(), + cacheMaximumSize, + cacheRefreshMs, + inputSchemaCacheMaximumSize); + operator.open(null); + + DynamicRecordInternal input = + new DynamicRecordInternal( + TABLE, + "branch", + SCHEMA1, + GenericRowData.of(1, "test"), + PartitionSpec.unpartitioned(), + 42, + false, + Collections.emptySet()); + DynamicRecordInternal output = operator.map(input); + + assertThat(catalog.tableExists(table)).isTrue(); + assertThat(input).isEqualTo(output); + } + + @Test + void testDynamicTableUpdateOperatorSchemaChange() throws Exception { + int cacheMaximumSize = 10; + int cacheRefreshMs = 1000; + int inputSchemaCacheMaximumSize = 10; + Catalog catalog = CATALOG_EXTENSION.catalog(); + TableIdentifier table = TableIdentifier.of(TABLE); + + DynamicTableUpdateOperator operator = + new DynamicTableUpdateOperator( + CATALOG_EXTENSION.catalogLoader(), + cacheMaximumSize, + cacheRefreshMs, + inputSchemaCacheMaximumSize); + operator.open(null); + + catalog.createTable(table, SCHEMA1); + DynamicRecordInternal input = + new DynamicRecordInternal( + TABLE, + "branch", + SCHEMA2, + GenericRowData.of(1, "test"), + PartitionSpec.unpartitioned(), + 42, + false, + Collections.emptySet()); + DynamicRecordInternal output = operator.map(input); + + assertThat(catalog.loadTable(table).schema().sameSchema(SCHEMA2)).isTrue(); + assertThat(input).isEqualTo(output); + + // Process the same input again + DynamicRecordInternal output2 = operator.map(input); + assertThat(output2).isEqualTo(output); + assertThat(catalog.loadTable(table).schema().schemaId()).isEqualTo(output.schema().schemaId()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java new file mode 100644 index 000000000000..713c67da170a --- /dev/null +++ 
b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; +import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.hadoop.util.Sets; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.io.WriteResult; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +class TestDynamicWriteResultAggregator { + + @RegisterExtension + static final HadoopCatalogExtension CATALOG_EXTENSION = new HadoopCatalogExtension("db", "table"); + + @Test + void testAggregator() throws Exception { + CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("table"), new Schema()); + CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("table2"), new Schema()); + + DynamicWriteResultAggregator aggregator = + new DynamicWriteResultAggregator(CATALOG_EXTENSION.catalogLoader()); + try (OneInputStreamOperatorTestHarness< + CommittableMessage, CommittableMessage> + testHarness = new OneInputStreamOperatorTestHarness<>(aggregator)) { + testHarness.open(); + + WriteTarget writeTarget1 = new WriteTarget("table", "branch", 42, 0, true, Sets.newHashSet()); + DynamicWriteResult dynamicWriteResult1 = + new DynamicWriteResult(writeTarget1, WriteResult.builder().build()); + WriteTarget writeTarget2 = + new WriteTarget("table2", "branch", 42, 0, true, Sets.newHashSet(1, 2)); + DynamicWriteResult dynamicWriteResult2 = + new DynamicWriteResult(writeTarget2, WriteResult.builder().build()); + + CommittableWithLineage committable1 = + new CommittableWithLineage<>(dynamicWriteResult1, 0, 0); + StreamRecord> record1 = + new StreamRecord<>(committable1); + testHarness.processElement(record1); + CommittableWithLineage committable2 = + new CommittableWithLineage<>(dynamicWriteResult2, 0, 0); + StreamRecord> record2 = + new StreamRecord<>(committable2); + testHarness.processElement(record2); + + assertThat(testHarness.getOutput()).isEmpty(); + + testHarness.prepareSnapshotPreBarrier(1L); + // Contains a CommittableSummary + DynamicCommittable + assertThat(testHarness.getRecordOutput()).hasSize(3); + + testHarness.prepareSnapshotPreBarrier(2L); + // Only contains a CommittableSummary + assertThat(testHarness.getRecordOutput()).hasSize(4); + } + 
} +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java new file mode 100644 index 000000000000..a3a9691107eb --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.hadoop.util.Sets; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.Test; + +class TestDynamicWriteResultSerializer { + + private static final DataFile DATA_FILE = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withMetrics( + new Metrics( + 42L, + null, + ImmutableMap.of(1, 5L), + ImmutableMap.of(1, 0L), + null, + ImmutableMap.of(1, ByteBuffer.allocate(1)), + ImmutableMap.of(1, ByteBuffer.allocate(1)))) + .build(); + + @Test + void testRoundtrip() throws IOException { + DynamicWriteResult dynamicWriteResult = + new DynamicWriteResult( + new WriteTarget("table", "branch", 42, 23, false, Sets.newHashSet(1, 2)), + WriteResult.builder().addDataFiles(DATA_FILE).build()); + + DynamicWriteResultSerializer serializer = new DynamicWriteResultSerializer(); + DynamicWriteResult copy = + serializer.deserialize(serializer.getVersion(), serializer.serialize(dynamicWriteResult)); + + assertThat(copy.writeResult().dataFiles()).hasSize(1); + DataFile dataFile = copy.writeResult().dataFiles()[0]; + // DataFile doesn't implement equals, but we can still do basic checks + assertThat(dataFile.path()).isEqualTo("/path/to/data-1.parquet"); + assertThat(dataFile.recordCount()).isEqualTo(42L); + } + + @Test + void testUnsupportedVersion() throws IOException { + DynamicWriteResult dynamicWriteResult = + new DynamicWriteResult( + new WriteTarget("table", "branch", 42, 23, false, Sets.newHashSet(1, 2)), + WriteResult.builder().addDataFiles(DATA_FILE).build()); + + DynamicWriteResultSerializer serializer = new DynamicWriteResultSerializer(); + assertThatThrownBy(() -> serializer.deserialize(-1, serializer.serialize(dynamicWriteResult))) + .hasMessage("Unrecognized version or corrupt state: -1") + 
.isInstanceOf(IOException.class); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java new file mode 100644 index 000000000000..42875982a000 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.File; +import java.net.URI; +import java.util.Collection; +import java.util.Map; +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.common.DynFields; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.sink.FlinkAppenderFactory; +import org.apache.iceberg.flink.sink.TestFlinkIcebergSinkBase; +import org.apache.iceberg.io.BaseTaskWriter; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.jetbrains.annotations.NotNull; +import org.junit.jupiter.api.Test; + +class TestDynamicWriter extends TestFlinkIcebergSinkBase { + + private static final TableIdentifier TABLE1 = TableIdentifier.of("myTable1"); + private static final TableIdentifier TABLE2 = TableIdentifier.of("myTable2"); + + @Test + void testDynamicWriter() throws Exception { + Catalog catalog = CATALOG_EXTENSION.catalog(); + Table table1 = catalog.createTable(TABLE1, SimpleDataUtil.SCHEMA); + Table table2 = catalog.createTable(TABLE2, SimpleDataUtil.SCHEMA); + + DynamicWriter dynamicWriter = createDynamicWriter(catalog); + + DynamicRecordInternal record1 = getDynamicRecordInternal(table1); + DynamicRecordInternal record2 = getDynamicRecordInternal(table2); + + assertThat(getNumDataFiles(table1)).isEqualTo(0); + + dynamicWriter.write(record1, null); + dynamicWriter.write(record2, null); + Collection writeResults = dynamicWriter.prepareCommit(); + + assertThat(writeResults).hasSize(2); + assertThat(getNumDataFiles(table1)).isEqualTo(1); + assertThat( + dynamicWriter + .getMetrics() + .writerMetrics(TABLE1.name()) + .getFlushedDataFiles() + .getCount()) + .isEqualTo(1); + assertThat( + dynamicWriter + .getMetrics() + .writerMetrics(TABLE2.name()) + 
.getFlushedDataFiles() + .getCount()) + .isEqualTo(1); + + WriteResult wr1 = writeResults.iterator().next().writeResult(); + assertThat(wr1.dataFiles().length).isEqualTo(1); + assertThat(wr1.dataFiles()[0].format()).isEqualTo(FileFormat.PARQUET); + assertThat(wr1.deleteFiles()).isEmpty(); + + dynamicWriter.write(record1, null); + dynamicWriter.write(record2, null); + writeResults = dynamicWriter.prepareCommit(); + + assertThat(writeResults).hasSize(2); + assertThat(getNumDataFiles(table1)).isEqualTo(2); + assertThat( + dynamicWriter + .getMetrics() + .writerMetrics(TABLE1.name()) + .getFlushedDataFiles() + .getCount()) + .isEqualTo(2); + assertThat( + dynamicWriter + .getMetrics() + .writerMetrics(TABLE2.name()) + .getFlushedDataFiles() + .getCount()) + .isEqualTo(2); + + WriteResult wr2 = writeResults.iterator().next().writeResult(); + assertThat(wr2.dataFiles().length).isEqualTo(1); + assertThat(wr2.dataFiles()[0].format()).isEqualTo(FileFormat.PARQUET); + assertThat(wr2.deleteFiles()).isEmpty(); + + dynamicWriter.close(); + } + + @Test + void testDynamicWriterPropertiesDefault() throws Exception { + Catalog catalog = CATALOG_EXTENSION.catalog(); + Table table1 = + catalog.createTable( + TABLE1, + SimpleDataUtil.SCHEMA, + null, + ImmutableMap.of("write.parquet.compression-codec", "zstd")); + + DynamicWriter dynamicWriter = createDynamicWriter(catalog); + DynamicRecordInternal record1 = getDynamicRecordInternal(table1); + + assertThat(getNumDataFiles(table1)).isEqualTo(0); + + dynamicWriter.write(record1, null); + Map properties = properties(dynamicWriter); + assertThat(properties).containsEntry("write.parquet.compression-codec", "zstd"); + + dynamicWriter.close(); + } + + @Test + void testDynamicWriterPropertiesPriority() throws Exception { + Catalog catalog = CATALOG_EXTENSION.catalog(); + Table table1 = + catalog.createTable( + TABLE1, + SimpleDataUtil.SCHEMA, + null, + ImmutableMap.of("write.parquet.compression-codec", "zstd")); + + DynamicWriter dynamicWriter = + createDynamicWriter(catalog, ImmutableMap.of("write.parquet.compression-codec", "gzip")); + DynamicRecordInternal record1 = getDynamicRecordInternal(table1); + + assertThat(getNumDataFiles(table1)).isEqualTo(0); + + dynamicWriter.write(record1, null); + Map properties = properties(dynamicWriter); + assertThat(properties).containsEntry("write.parquet.compression-codec", "gzip"); + + dynamicWriter.close(); + } + + @Test + void testDynamicWriterUpsert() throws Exception { + Catalog catalog = CATALOG_EXTENSION.catalog(); + DynamicWriter dyamicWriter = createDynamicWriter(catalog); + Table table1 = CATALOG_EXTENSION.catalog().createTable(TABLE1, SimpleDataUtil.SCHEMA); + + DynamicRecordInternal record = getDynamicRecordInternal(table1); + record.setUpsertMode(true); + record.setEqualityFieldIds(Sets.newHashSet(1)); + + dyamicWriter.write(record, null); + dyamicWriter.prepareCommit(); + + assertThat( + dyamicWriter + .getMetrics() + .writerMetrics(TABLE1.name()) + .getFlushedDeleteFiles() + .getCount()) + .isEqualTo(1); + assertThat( + dyamicWriter.getMetrics().writerMetrics(TABLE1.name()).getFlushedDataFiles().getCount()) + .isEqualTo(1); + } + + @Test + void testDynamicWriterUpsertNoEqualityFields() { + Catalog catalog = CATALOG_EXTENSION.catalog(); + DynamicWriter dyamicWriter = createDynamicWriter(catalog); + Table table1 = CATALOG_EXTENSION.catalog().createTable(TABLE1, SimpleDataUtil.SCHEMA); + + DynamicRecordInternal record = getDynamicRecordInternal(table1); + record.setUpsertMode(true); + + assertThatThrownBy(() -> 
dyamicWriter.write(record, null)) + .isInstanceOf(IllegalStateException.class) + .hasMessage( + "Equality field columns shouldn't be empty when configuring to use UPSERT data."); + } + + private static @NotNull DynamicWriter createDynamicWriter( + Catalog catalog, Map<String, String> properties) { + DynamicWriter dynamicWriter = + new DynamicWriter( + catalog, + FileFormat.PARQUET, + 1024L, + properties, + 100, + new DynamicWriterMetrics(new UnregisteredMetricsGroup()), + 0, + 0); + return dynamicWriter; + } + + private static @NotNull DynamicWriter createDynamicWriter(Catalog catalog) { + return createDynamicWriter(catalog, Map.of()); + } + + private static @NotNull DynamicRecordInternal getDynamicRecordInternal(Table table1) { + DynamicRecordInternal record = new DynamicRecordInternal(); + record.setTableName(TableIdentifier.parse(table1.name()).name()); + record.setSchema(table1.schema()); + record.setSpec(table1.spec()); + record.setRowData(SimpleDataUtil.createRowData(1, "test")); + return record; + } + + private static int getNumDataFiles(Table table) { + File dataDir = new File(URI.create(table.location()).getPath(), "data"); + if (dataDir.exists()) { + return dataDir.listFiles((dir, name) -> !name.startsWith(".")).length; + } + return 0; + } + + private Map<String, String> properties(DynamicWriter dynamicWriter) { + DynFields.BoundField<Map<?, TaskWriter<RowData>>> writerField = + DynFields.builder().hiddenImpl(dynamicWriter.getClass(), "writers").build(dynamicWriter); + + DynFields.BoundField<FlinkAppenderFactory> appenderField = + DynFields.builder() + .hiddenImpl(BaseTaskWriter.class, "appenderFactory") + .build(writerField.get().values().iterator().next()); + DynFields.BoundField<Map<String, String>> propsField = + DynFields.builder() + .hiddenImpl(FlinkAppenderFactory.class, "props") + .build(appenderField.get()); + return propsField.get(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java new file mode 100644 index 000000000000..d416e7ec1fc6 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java @@ -0,0 +1,626 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.apache.iceberg.types.Types.NestedField.of; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.lang.reflect.Constructor; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.iceberg.Schema; +import org.apache.iceberg.UpdateSchema; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type.PrimitiveType; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.DecimalType; +import org.apache.iceberg.types.Types.DoubleType; +import org.apache.iceberg.types.Types.FloatType; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.ListType; +import org.apache.iceberg.types.Types.LongType; +import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.types.Types.StructType; +import org.apache.iceberg.types.Types.TimeType; +import org.apache.iceberg.types.Types.UUIDType; +import org.junit.jupiter.api.Test; + +public class TestEvolveSchemaVisitor { + + private static List primitiveTypes() { + return Lists.newArrayList( + StringType.get(), + TimeType.get(), + Types.TimestampType.withoutZone(), + Types.TimestampType.withZone(), + UUIDType.get(), + Types.DateType.get(), + Types.BooleanType.get(), + Types.BinaryType.get(), + DoubleType.get(), + IntegerType.get(), + Types.FixedType.ofLength(10), + DecimalType.of(10, 2), + LongType.get(), + FloatType.get()); + } + + private static Types.NestedField[] primitiveFields( + Integer initialValue, List primitiveTypes) { + return primitiveFields(initialValue, primitiveTypes, true); + } + + private static Types.NestedField[] primitiveFields( + Integer initialValue, List primitiveTypes, boolean optional) { + AtomicInteger atomicInteger = new AtomicInteger(initialValue); + return primitiveTypes.stream() + .map( + type -> + of( + atomicInteger.incrementAndGet(), + optional, + type.toString(), + Types.fromPrimitiveString(type.toString()))) + .toArray(Types.NestedField[]::new); + } + + @Test + public void testAddTopLevelPrimitives() { + Schema targetSchema = new Schema(primitiveFields(0, primitiveTypes())); + UpdateSchema updateApi = loadUpdateApi(new Schema()); + EvolveSchemaVisitor.visit(updateApi, new Schema(), targetSchema); + assertThat(targetSchema.asStruct()).isEqualTo(updateApi.apply().asStruct()); + } + + @Test + public void testMakeTopLevelPrimitivesOptional() { + Schema existingSchema = new Schema(primitiveFields(0, primitiveTypes(), false)); + assertThat(existingSchema.columns().stream().allMatch(Types.NestedField::isRequired)).isTrue(); + + UpdateSchema updateApi = loadUpdateApi(existingSchema); + EvolveSchemaVisitor.visit(updateApi, existingSchema, new Schema()); + Schema newSchema = updateApi.apply(); + assertThat(newSchema.asStruct().fields()).hasSize(14); + assertThat(newSchema.columns().stream().allMatch(Types.NestedField::isOptional)).isTrue(); + } + + @Test + public void testIdentifyFieldsByName() { + Schema existingSchema = + new Schema(Types.NestedField.optional(42, "myField", Types.LongType.get())); + UpdateSchema updateApi = loadUpdateApi(existingSchema); + Schema newSchema = + new 
Schema(Arrays.asList(Types.NestedField.optional(-1, "myField", Types.LongType.get()))); + EvolveSchemaVisitor.visit(updateApi, existingSchema, newSchema); + assertThat(updateApi.apply().sameSchema(existingSchema)).isTrue(); + } + + @Test + public void testChangeOrderTopLevelPrimitives() { + Schema existingSchema = + new Schema( + Arrays.asList(optional(1, "a", StringType.get()), optional(2, "b", StringType.get()))); + Schema targetSchema = + new Schema( + Arrays.asList(optional(2, "b", StringType.get()), optional(1, "a", StringType.get()))); + UpdateSchema updateApi = loadUpdateApi(existingSchema); + EvolveSchemaVisitor.visit(updateApi, existingSchema, targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + + @Test + public void testAddTopLevelListOfPrimitives() { + for (PrimitiveType primitiveType : primitiveTypes()) { + Schema targetSchema = new Schema(optional(1, "aList", ListType.ofOptional(2, primitiveType))); + UpdateSchema updateApi = loadUpdateApi(new Schema()); + EvolveSchemaVisitor.visit(updateApi, new Schema(), targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + } + + @Test + public void testMakeTopLevelListOfPrimitivesOptional() { + for (PrimitiveType primitiveType : primitiveTypes()) { + Schema existingSchema = + new Schema(optional(1, "aList", ListType.ofRequired(2, primitiveType))); + Schema targetSchema = new Schema(); + UpdateSchema updateApi = loadUpdateApi(existingSchema); + EvolveSchemaVisitor.visit(updateApi, existingSchema, targetSchema); + Schema expectedSchema = + new Schema(optional(1, "aList", ListType.ofRequired(2, primitiveType))); + assertThat(updateApi.apply().asStruct()).isEqualTo(expectedSchema.asStruct()); + } + } + + @Test + public void testAddTopLevelMapOfPrimitives() { + for (PrimitiveType primitiveType : primitiveTypes()) { + Schema targetSchema = + new Schema(optional(1, "aMap", MapType.ofOptional(2, 3, primitiveType, primitiveType))); + UpdateSchema updateApi = loadUpdateApi(new Schema()); + EvolveSchemaVisitor.visit(updateApi, new Schema(), targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + } + + @Test + public void testAddTopLevelStructOfPrimitives() { + for (PrimitiveType primitiveType : primitiveTypes()) { + Schema currentSchema = + new Schema( + optional(1, "aStruct", StructType.of(optional(2, "primitive", primitiveType)))); + UpdateSchema updateApi = loadUpdateApi(new Schema()); + EvolveSchemaVisitor.visit(updateApi, new Schema(), currentSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(currentSchema.asStruct()); + } + } + + @Test + public void testAddNestedPrimitive() { + for (PrimitiveType primitiveType : primitiveTypes()) { + Schema currentSchema = new Schema(optional(1, "aStruct", StructType.of())); + Schema targetSchema = + new Schema( + optional(1, "aStruct", StructType.of(optional(2, "primitive", primitiveType)))); + UpdateSchema updateApi = loadUpdateApi(currentSchema); + EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + } + + @Test + public void testMakeNestedPrimitiveOptional() { + for (PrimitiveType primitiveType : primitiveTypes()) { + Schema currentSchema = + new Schema( + optional(1, "aStruct", StructType.of(required(2, "primitive", primitiveType)))); + Schema targetSchema = + new Schema( + optional(1, "aStruct", StructType.of(optional(2, "primitive", primitiveType)))); + 
UpdateSchema updateApi = loadUpdateApi(currentSchema); + EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + } + + @Test + public void testAddNestedPrimitives() { + Schema currentSchema = new Schema(optional(1, "aStruct", StructType.of())); + Schema targetSchema = + new Schema(optional(1, "aStruct", StructType.of(primitiveFields(1, primitiveTypes())))); + UpdateSchema updateApi = loadUpdateApi(currentSchema); + EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + + @Test + public void testAddNestedLists() { + Schema targetSchema = + new Schema( + optional( + 1, + "aList", + ListType.ofOptional( + 2, + ListType.ofOptional( + 3, + ListType.ofOptional( + 4, + ListType.ofOptional( + 5, + ListType.ofOptional( + 6, + ListType.ofOptional( + 7, + ListType.ofOptional( + 8, + ListType.ofOptional( + 9, + ListType.ofOptional( + 10, DecimalType.of(11, 20)))))))))))); + UpdateSchema updateApi = loadUpdateApi(new Schema()); + EvolveSchemaVisitor.visit(updateApi, new Schema(), targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + + @Test + public void testAddNestedStruct() { + Schema currentSchema = + new Schema(optional(1, "struct1", StructType.of(optional(2, "struct2", StructType.of())))); + Schema targetSchema = + new Schema( + optional( + 1, + "struct1", + StructType.of( + optional( + 2, + "struct2", + StructType.of( + optional( + 3, + "struct3", + StructType.of( + optional( + 4, + "struct4", + StructType.of( + optional( + 5, + "struct5", + StructType.of( + optional( + 6, + "struct6", + StructType.of( + optional( + 7, + "aString", + StringType.get())))))))))))))); + UpdateSchema updateApi = loadUpdateApi(currentSchema); + EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + + @Test + public void testAddNestedMaps() { + Schema targetSchema = + new Schema( + optional( + 1, + "struct", + MapType.ofOptional( + 2, + 3, + StringType.get(), + MapType.ofOptional( + 4, + 5, + StringType.get(), + MapType.ofOptional( + 6, + 7, + StringType.get(), + MapType.ofOptional( + 8, + 9, + StringType.get(), + MapType.ofOptional( + 10, + 11, + StringType.get(), + MapType.ofOptional( + 12, 13, StringType.get(), StringType.get())))))))); + + UpdateSchema updateApi = loadUpdateApi(new Schema()); + EvolveSchemaVisitor.visit(updateApi, new Schema(), targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + + @Test + public void testDetectInvalidTopLevelList() { + Schema currentSchema = + new Schema(optional(1, "aList", ListType.ofOptional(2, StringType.get()))); + Schema targetSchema = new Schema(optional(1, "aList", ListType.ofOptional(2, LongType.get()))); + assertThatThrownBy( + () -> + EvolveSchemaVisitor.visit( + loadUpdateApi(currentSchema), currentSchema, targetSchema)) + .hasMessage("Cannot change column type: aList.element: string -> long") + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + public void testDetectInvalidTopLevelMapValue() { + + Schema currentSchema = + new Schema( + optional(1, "aMap", MapType.ofOptional(2, 3, StringType.get(), StringType.get()))); + Schema targetSchema = + new Schema(optional(1, "aMap", MapType.ofOptional(2, 3, StringType.get(), LongType.get()))); + + assertThatThrownBy( + () -> + 
EvolveSchemaVisitor.visit( + loadUpdateApi(currentSchema), currentSchema, targetSchema)) + .hasMessage("Cannot change column type: aMap.value: string -> long") + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + public void testDetectInvalidTopLevelMapKey() { + Schema currentSchema = + new Schema( + optional(1, "aMap", MapType.ofOptional(2, 3, StringType.get(), StringType.get()))); + Schema targetSchema = + new Schema(optional(1, "aMap", MapType.ofOptional(2, 3, UUIDType.get(), StringType.get()))); + assertThatThrownBy( + () -> + EvolveSchemaVisitor.visit( + loadUpdateApi(currentSchema), currentSchema, targetSchema)) + .hasMessage("Cannot change column type: aMap.key: string -> uuid") + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + // int 32-bit signed integers -> Can promote to long + public void testTypePromoteIntegerToLong() { + Schema currentSchema = new Schema(required(1, "aCol", IntegerType.get())); + Schema targetSchema = new Schema(required(1, "aCol", LongType.get())); + + UpdateSchema updateApi = loadUpdateApi(currentSchema); + EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); + Schema applied = updateApi.apply(); + assertThat(applied.asStruct().fields()).hasSize(1); + assertThat(applied.asStruct().fields().get(0).type()).isEqualTo(LongType.get()); + } + + @Test + // float 32-bit IEEE 754 floating point -> Can promote to double + public void testTypePromoteFloatToDouble() { + Schema currentSchema = new Schema(required(1, "aCol", FloatType.get())); + Schema targetSchema = new Schema(required(1, "aCol", DoubleType.get())); + + UpdateSchema updateApi = loadUpdateApi(currentSchema); + EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); + Schema applied = updateApi.apply(); + assertThat(applied.asStruct().fields()).hasSize(1); + assertThat(applied.asStruct().fields().get(0).type()).isEqualTo(DoubleType.get()); + } + + @Test + public void testInvalidTypePromoteDoubleToFloat() { + Schema currentSchema = new Schema(required(1, "aCol", DoubleType.get())); + Schema targetSchema = new Schema(required(1, "aCol", FloatType.get())); + assertThatThrownBy( + () -> + EvolveSchemaVisitor.visit( + loadUpdateApi(currentSchema), currentSchema, targetSchema)) + .hasMessage("Cannot change column type: aCol: double -> float") + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + // decimal(P,S) Fixed-point decimal; precision P, scale S -> Scale is fixed [1], precision must be + // 38 or less + public void testTypePromoteDecimalToFixedScaleWithWiderPrecision() { + Schema currentSchema = new Schema(required(1, "aCol", DecimalType.of(20, 1))); + Schema targetSchema = new Schema(required(1, "aCol", DecimalType.of(22, 1))); + + UpdateSchema updateApi = loadUpdateApi(currentSchema); + EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + + @Test + public void testAddPrimitiveToNestedStruct() { + Schema existingSchema = + new Schema( + required( + 1, + "struct1", + StructType.of( + optional( + 2, + "struct2", + StructType.of( + optional( + 3, + "list", + ListType.ofOptional( + 4, + StructType.of(optional(5, "number", IntegerType.get()))))))))); + + Schema targetSchema = + new Schema( + required( + 1, + "struct1", + StructType.of( + optional( + 2, + "struct2", + StructType.of( + optional( + 3, + "list", + ListType.ofOptional( + 4, + StructType.of( + optional(5, "number", LongType.get()), + optional(6, "time", TimeType.get()))))))))); + + 
UpdateSchema updateApi = loadUpdateApi(existingSchema); + EvolveSchemaVisitor.visit(updateApi, existingSchema, targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + + @Test + public void testReplaceListWithPrimitive() { + Schema currentSchema = + new Schema(optional(1, "aColumn", ListType.ofOptional(2, StringType.get()))); + Schema targetSchema = new Schema(optional(1, "aColumn", StringType.get())); + assertThatThrownBy( + () -> + EvolveSchemaVisitor.visit( + loadUpdateApi(currentSchema), currentSchema, targetSchema)) + .hasMessage("Cannot change column type: aColumn: list -> string") + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + public void addNewTopLevelStruct() { + Schema currentSchema = + new Schema( + optional( + 1, + "map1", + MapType.ofOptional( + 2, + 3, + StringType.get(), + ListType.ofOptional( + 4, StructType.of(optional(5, "string1", StringType.get())))))); + + Schema targetSchema = + new Schema( + optional( + 1, + "map1", + MapType.ofOptional( + 2, + 3, + StringType.get(), + ListType.ofOptional( + 4, StructType.of(optional(5, "string1", StringType.get()))))), + optional( + 6, + "struct1", + StructType.of( + optional(7, "d1", StructType.of(optional(8, "d2", StringType.get())))))); + + UpdateSchema updateApi = loadUpdateApi(currentSchema); + EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + + @Test + public void testAppendNestedStruct() { + Schema currentSchema = + new Schema( + required( + 1, + "s1", + StructType.of( + optional( + 2, + "s2", + StructType.of( + optional( + 3, "s3", StructType.of(optional(4, "s4", StringType.get())))))))); + + Schema targetSchema = + new Schema( + required( + 1, + "s1", + StructType.of( + optional( + 2, + "s2", + StructType.of( + optional(3, "s3", StructType.of(optional(4, "s4", StringType.get()))), + optional( + 5, + "repeat", + StructType.of( + optional( + 6, + "s1", + StructType.of( + optional( + 7, + "s2", + StructType.of( + optional( + 8, + "s3", + StructType.of( + optional( + 9, + "s4", + StringType.get())))))))))))))); + + UpdateSchema updateApi = loadUpdateApi(currentSchema); + EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); + assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); + } + + @Test + public void testMakeNestedStructOptional() { + Schema currentSchema = getNestedSchemaWithOptionalModifier(false); + Schema targetSchema = + new Schema( + required( + 1, + "s1", + StructType.of( + optional( + 2, + "s2", + StructType.of( + optional( + 3, "s3", StructType.of(optional(4, "s4", StringType.get())))))))); + UpdateSchema updateApi = loadUpdateApi(currentSchema); + EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); + assertThat(getNestedSchemaWithOptionalModifier(true).asStruct()) + .isEqualTo(updateApi.apply().asStruct()); + } + + private static Schema getNestedSchemaWithOptionalModifier(boolean nestedIsOptional) { + return new Schema( + required( + 1, + "s1", + StructType.of( + optional( + 2, + "s2", + StructType.of( + optional(3, "s3", StructType.of(optional(4, "s4", StringType.get()))), + of( + 5, + nestedIsOptional, + "repeat", + StructType.of( + optional( + 6, + "s1", + StructType.of( + optional( + 7, + "s2", + StructType.of( + optional( + 8, + "s3", + StructType.of( + optional( + 9, "s4", StringType.get())))))))))))))); + } + + private static UpdateSchema loadUpdateApi(Schema schema) { + try { + Constructor 
constructor = + TestEvolveSchemaVisitor.class + .getClassLoader() + .loadClass("org.apache.iceberg.SchemaUpdate") + .getDeclaredConstructor(Schema.class, int.class); + constructor.setAccessible(true); + return (UpdateSchema) constructor.newInstance(schema, schema.highestFieldId()); + } catch (Exception e) { + throw new RuntimeException("Failed to instantiate SchemaUpdate class", e); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java new file mode 100644 index 000000000000..8d559e920620 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java @@ -0,0 +1,350 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.runtime.state.KeyGroupRangeAssignment; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestHashKeyGenerator { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + private static final String BRANCH = "main"; + private static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of("default", "table"); + + @Test + void testRoundRobinWithDistributionModeNone() throws Exception { + int writeParallelism = 10; + int maxWriteParallelism = 2; + HashKeyGenerator generator = new HashKeyGenerator(1, maxWriteParallelism); + PartitionSpec spec = PartitionSpec.unpartitioned(); + + GenericRowData row = GenericRowData.of(1, StringData.fromString("z")); + int writeKey1 = + getWriteKey( + generator, spec, DistributionMode.NONE, writeParallelism, Collections.emptySet(), row); + int writeKey2 = + getWriteKey( + generator, spec, DistributionMode.NONE, writeParallelism, Collections.emptySet(), row); + int writeKey3 = + getWriteKey( + generator, spec, DistributionMode.NONE, writeParallelism, Collections.emptySet(), row); + int writeKey4 = + getWriteKey( + generator, spec, DistributionMode.NONE, writeParallelism, 
Collections.emptySet(), row); + + assertThat(writeKey1).isNotEqualTo(writeKey2); + assertThat(writeKey3).isEqualTo(writeKey1); + assertThat(writeKey4).isEqualTo(writeKey2); + + assertThat(getSubTaskId(writeKey1, writeParallelism, maxWriteParallelism)).isEqualTo(0); + assertThat(getSubTaskId(writeKey2, writeParallelism, maxWriteParallelism)).isEqualTo(5); + assertThat(getSubTaskId(writeKey3, writeParallelism, maxWriteParallelism)).isEqualTo(0); + assertThat(getSubTaskId(writeKey4, writeParallelism, maxWriteParallelism)).isEqualTo(5); + } + + @Test + void testBucketingWithDistributionModeHash() throws Exception { + int writeParallelism = 3; + int maxWriteParallelism = 8; + HashKeyGenerator generator = new HashKeyGenerator(1, maxWriteParallelism); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); + + GenericRowData row1 = GenericRowData.of(1, StringData.fromString("a")); + GenericRowData row2 = GenericRowData.of(1, StringData.fromString("b")); + GenericRowData row3 = GenericRowData.of(2, StringData.fromString("c")); + GenericRowData row4 = GenericRowData.of(2, StringData.fromString("d")); + + int writeKey1 = + getWriteKey( + generator, spec, DistributionMode.HASH, writeParallelism, Collections.emptySet(), row1); + int writeKey2 = + getWriteKey( + generator, spec, DistributionMode.HASH, writeParallelism, Collections.emptySet(), row2); + int writeKey3 = + getWriteKey( + generator, spec, DistributionMode.HASH, writeParallelism, Collections.emptySet(), row3); + int writeKey4 = + getWriteKey( + generator, spec, DistributionMode.HASH, writeParallelism, Collections.emptySet(), row4); + + assertThat(writeKey1).isEqualTo(writeKey2); + assertThat(writeKey3).isNotEqualTo(writeKey1); + assertThat(writeKey4).isEqualTo(writeKey3); + + assertThat(getSubTaskId(writeKey1, writeParallelism, maxWriteParallelism)).isEqualTo(0); + assertThat(getSubTaskId(writeKey2, writeParallelism, maxWriteParallelism)).isEqualTo(0); + assertThat(getSubTaskId(writeKey3, writeParallelism, maxWriteParallelism)).isEqualTo(1); + assertThat(getSubTaskId(writeKey4, writeParallelism, maxWriteParallelism)).isEqualTo(1); + } + + @Test + void testEqualityKeys() throws Exception { + int writeParallelism = 2; + int maxWriteParallelism = 8; + HashKeyGenerator generator = new HashKeyGenerator(16, maxWriteParallelism); + PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); + + GenericRowData row1 = GenericRowData.of(1, StringData.fromString("foo")); + GenericRowData row2 = GenericRowData.of(1, StringData.fromString("bar")); + GenericRowData row3 = GenericRowData.of(2, StringData.fromString("baz")); + Set equalityColumns = Collections.singleton("id"); + + int writeKey1 = + getWriteKey( + generator, + unpartitioned, + DistributionMode.NONE, + writeParallelism, + equalityColumns, + row1); + int writeKey2 = + getWriteKey( + generator, + unpartitioned, + DistributionMode.NONE, + writeParallelism, + equalityColumns, + row2); + int writeKey3 = + getWriteKey( + generator, + unpartitioned, + DistributionMode.NONE, + writeParallelism, + equalityColumns, + row3); + + assertThat(writeKey1).isEqualTo(writeKey2); + assertThat(writeKey2).isNotEqualTo(writeKey3); + + assertThat(getSubTaskId(writeKey1, writeParallelism, maxWriteParallelism)).isEqualTo(1); + assertThat(getSubTaskId(writeKey2, writeParallelism, maxWriteParallelism)).isEqualTo(1); + assertThat(getSubTaskId(writeKey3, writeParallelism, maxWriteParallelism)).isEqualTo(0); + } + + @Test + void testCapAtMaxWriteParallelism() throws Exception { + int 
writeParallelism = 10; + int maxWriteParallelism = 5; + HashKeyGenerator generator = new HashKeyGenerator(16, maxWriteParallelism); + PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); + + Set writeKeys = Sets.newHashSet(); + for (int i = 0; i < 20; i++) { + GenericRowData row = GenericRowData.of(i, StringData.fromString("z")); + writeKeys.add( + getWriteKey( + generator, + unpartitioned, + DistributionMode.NONE, + writeParallelism, + Collections.emptySet(), + row)); + } + + assertThat(writeKeys).hasSize(maxWriteParallelism); + assertThat( + writeKeys.stream() + .map(key -> getSubTaskId(key, writeParallelism, writeParallelism)) + .distinct() + .count()) + .isEqualTo(maxWriteParallelism); + } + + @Test + void testHashModeWithoutEqualityFieldsFallsBackToNone() throws Exception { + int writeParallelism = 2; + int maxWriteParallelism = 8; + HashKeyGenerator generator = new HashKeyGenerator(16, maxWriteParallelism); + Schema noIdSchema = new Schema(Types.NestedField.required(1, "x", Types.StringType.get())); + PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); + + DynamicRecord record = + new DynamicRecord( + TABLE_IDENTIFIER, + BRANCH, + noIdSchema, + GenericRowData.of(StringData.fromString("v")), + unpartitioned, + DistributionMode.HASH, + writeParallelism); + + int writeKey1 = generator.generateKey(record); + int writeKey2 = generator.generateKey(record); + int writeKey3 = generator.generateKey(record); + assertThat(writeKey1).isNotEqualTo(writeKey2); + assertThat(writeKey3).isEqualTo(writeKey1); + + assertThat(getSubTaskId(writeKey1, writeParallelism, maxWriteParallelism)).isEqualTo(1); + assertThat(getSubTaskId(writeKey2, writeParallelism, maxWriteParallelism)).isEqualTo(0); + assertThat(getSubTaskId(writeKey3, writeParallelism, maxWriteParallelism)).isEqualTo(1); + } + + @Test + void testSchemaSpecOverrides() throws Exception { + int maxCacheSize = 10; + int writeParallelism = 5; + int maxWriteParallelism = 10; + HashKeyGenerator generator = new HashKeyGenerator(maxCacheSize, maxWriteParallelism); + + DynamicRecord record = + new DynamicRecord( + TABLE_IDENTIFIER, + BRANCH, + SCHEMA, + GenericRowData.of(1, StringData.fromString("foo")), + PartitionSpec.unpartitioned(), + DistributionMode.NONE, + writeParallelism); + + int writeKey1 = generator.generateKey(record); + int writeKey2 = generator.generateKey(record); + // Assert that we are bucketing via NONE (round-robin) + assertThat(writeKey1).isNotEqualTo(writeKey2); + + // Schema has different id + Schema overrideSchema = new Schema(42, SCHEMA.columns()); + // Spec has different id + PartitionSpec overrideSpec = PartitionSpec.builderFor(SCHEMA).withSpecId(42).build(); + RowData overrideData = GenericRowData.of(1L, StringData.fromString("foo")); + + // We get a new key selector for the schema which starts off on the same offset + assertThat(generator.generateKey(record, overrideSchema, null, null)).isEqualTo(writeKey1); + // We get a new key selector for the spec which starts off on the same offset + assertThat(generator.generateKey(record, null, overrideSpec, null)).isEqualTo(writeKey1); + // We get the same key selector which yields a different result for the overridden data + assertThat(generator.generateKey(record, null, null, overrideData)).isNotEqualTo(writeKey1); + } + + @Test + void testMultipleTables() throws Exception { + int maxCacheSize = 10; + int writeParallelism = 2; + int maxWriteParallelism = 8; + HashKeyGenerator generator = new HashKeyGenerator(maxCacheSize, maxWriteParallelism); + + PartitionSpec 
unpartitioned = PartitionSpec.unpartitioned(); + + GenericRowData rowData = GenericRowData.of(1, StringData.fromString("foo")); + + DynamicRecord record1 = + new DynamicRecord( + TableIdentifier.of("a", "table"), + BRANCH, + SCHEMA, + rowData, + unpartitioned, + DistributionMode.HASH, + writeParallelism); + record1.setEqualityFields(Collections.singleton("id")); + DynamicRecord record2 = + new DynamicRecord( + TableIdentifier.of("my", "other", "table"), + BRANCH, + SCHEMA, + rowData, + unpartitioned, + DistributionMode.HASH, + writeParallelism); + record2.setEqualityFields(Collections.singleton("id")); + + // Consistent hashing for the same record due to HASH distribution mode + int writeKeyRecord1 = generator.generateKey(record1); + assertThat(writeKeyRecord1).isEqualTo(generator.generateKey(record1)); + int writeKeyRecord2 = generator.generateKey(record2); + assertThat(writeKeyRecord2).isEqualTo(generator.generateKey(record2)); + + // But the write keys are for different tables and should not be equal + assertThat(writeKeyRecord1).isNotEqualTo(writeKeyRecord2); + + assertThat(getSubTaskId(writeKeyRecord1, writeParallelism, maxWriteParallelism)).isEqualTo(1); + assertThat(getSubTaskId(writeKeyRecord2, writeParallelism, maxWriteParallelism)).isEqualTo(0); + } + + @Test + void testCaching() throws Exception { + int maxCacheSize = 1; + int writeParallelism = 2; + int maxWriteParallelism = 8; + HashKeyGenerator generator = new HashKeyGenerator(maxCacheSize, maxWriteParallelism); + Map<?, KeySelector<RowData, Integer>> keySelectorCache = + generator.getKeySelectorCache(); + + PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); + DynamicRecord record = + new DynamicRecord( + TABLE_IDENTIFIER, + BRANCH, + SCHEMA, + GenericRowData.of(1, StringData.fromString("foo")), + unpartitioned, + DistributionMode.NONE, + writeParallelism); + + int writeKey1 = generator.generateKey(record); + assertThat(keySelectorCache).hasSize(1); + + int writeKey2 = generator.generateKey(record); + assertThat(writeKey2).isNotEqualTo(writeKey1); + assertThat(keySelectorCache).hasSize(1); + + int writeKey3 = generator.generateKey(record); + assertThat(keySelectorCache).hasSize(1); + // We create a new key selector which will start off at the same position + assertThat(writeKey1).isEqualTo(writeKey3); + } + + private static int getWriteKey( + HashKeyGenerator generator, + PartitionSpec spec, + DistributionMode mode, + int writeParallelism, + Set<String> equalityFields, + GenericRowData row) + throws Exception { + DynamicRecord record = + new DynamicRecord(TABLE_IDENTIFIER, BRANCH, SCHEMA, row, spec, mode, writeParallelism); + record.setEqualityFields(equalityFields); + return generator.generateKey(record); + } + + private static int getSubTaskId(int writeKey1, int writeParallelism, int maxWriteParallelism) { + return KeyGroupRangeAssignment.assignKeyToParallelOperator( + writeKey1, maxWriteParallelism, writeParallelism); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java new file mode 100644 index 000000000000..679d3de978a3 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.Test; + +class TestLRUCache { + private static final Consumer<Map.Entry<Integer, Integer>> NO_OP_CALLBACK = ignored -> {}; + + @Test + void testPut() { + LRUCache<Integer, Integer> cache = new LRUCache<>(1, NO_OP_CALLBACK); + cache.put(1, 1); + + assertThat(cache).hasSize(1).containsEntry(1, 1); + } + + @Test + void testGet() { + LRUCache<Integer, Integer> cache = new LRUCache<>(1, NO_OP_CALLBACK); + cache.put(1, 123); + + assertThat(cache).hasSize(1); + assertThat(cache.get(1)).isEqualTo(123); + } + + @Test + void testElementEviction() { + int maxSize = 2; + LRUCache<Integer, Integer> cache = new LRUCache<>(maxSize, NO_OP_CALLBACK); + + cache.put(1, 1); + cache.put(2, 2); + Integer value = cache.get(1); + assertThat(value).isEqualTo(1); + + cache.put(3, 3); // "2" should be evicted + + assertThat(cache).hasSize(2).containsEntry(1, 1).containsEntry(3, 3); + } + + @Test + void testEvictionCallback() { + int maxSize = 2; + TestEvictionCallback callback = new TestEvictionCallback(); + LRUCache<Integer, Integer> cache = new LRUCache<>(maxSize, callback); + + cache.put(1, 1); + cache.put(2, 2); + Integer value = cache.get(1); + assertThat(value).isEqualTo(1); + + cache.put(3, 3); // "2" should be evicted + + assertThat(callback.evictedEntries).containsExactly(Map.entry(2, 2)); + } + + private static class TestEvictionCallback implements Consumer<Map.Entry<Integer, Integer>> { + private final List<Map.Entry<Integer, Integer>> evictedEntries = Lists.newArrayList(); + + @Override + public void accept(Map.Entry<Integer, Integer> entry) { + evictedEntries.add(entry); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java new file mode 100644 index 000000000000..3e7025de6f91 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestPartitionSpecEvolution { + + @Test + void testCompatible() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required(1, "data", Types.StringType.get())); + + PartitionSpec spec1 = PartitionSpec.builderFor(schema).bucket("id", 10).build(); + PartitionSpec spec2 = PartitionSpec.builderFor(schema).bucket("id", 10).build(); + + // Happy case, source ids and names match + assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isTrue(); + } + + @Test + void testNotCompatibleDifferentTransform() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required(1, "data", Types.StringType.get())); + + PartitionSpec spec1 = PartitionSpec.builderFor(schema).bucket("id", 10).build(); + // Same spec als spec1 but different number of buckets + PartitionSpec spec2 = PartitionSpec.builderFor(schema).bucket("id", 23).build(); + + assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); + } + + @Test + void testNotCompatibleMoreFields() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required(1, "data", Types.StringType.get())); + + PartitionSpec spec1 = PartitionSpec.builderFor(schema).bucket("id", 10).build(); + // Additional field + PartitionSpec spec2 = + PartitionSpec.builderFor(schema).bucket("id", 10).truncate("data", 1).build(); + + assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); + } + + @Test + void testCompatibleWithNonMatchingSourceIds() { + Schema schema1 = + new Schema( + // Use zero-based field ids + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required(1, "data", Types.StringType.get())); + + PartitionSpec spec1 = PartitionSpec.builderFor(schema1).bucket("id", 10).build(); + + Schema schema2 = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + // Same spec als spec1 but bound to a different schema + PartitionSpec spec2 = PartitionSpec.builderFor(schema2).bucket("id", 10).build(); + + // Compatible because the source names match + assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isTrue(); + } + + @Test + void testPartitionSpecEvolution() { + Schema schema1 = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required(1, "data", Types.StringType.get())); + + PartitionSpec spec1 = PartitionSpec.builderFor(schema1).bucket("id", 10).build(); + + Schema schema2 = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + // Change num buckets + PartitionSpec spec2 = PartitionSpec.builderFor(schema2).bucket("id", 23).build(); + + assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); + PartitionSpecEvolution.PartitionSpecChanges result = + PartitionSpecEvolution.evolve(spec1, spec2); + + assertThat(result.termsToAdd().toString()).isEqualTo("[bucket[23](ref(name=\"id\"))]"); + 
assertThat(result.termsToRemove().toString()).isEqualTo("[bucket[10](ref(name=\"id\"))]"); + } + + @Test + void testPartitionSpecEvolutionAddField() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required(1, "data", Types.StringType.get())); + + PartitionSpec spec1 = PartitionSpec.builderFor(schema).build(); + // Add field + PartitionSpec spec2 = PartitionSpec.builderFor(schema).bucket("id", 23).build(); + + assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); + PartitionSpecEvolution.PartitionSpecChanges result = + PartitionSpecEvolution.evolve(spec1, spec2); + + assertThat(result.termsToAdd().toString()).isEqualTo("[bucket[23](ref(name=\"id\"))]"); + assertThat(result.termsToRemove().toString()).isEqualTo("[]"); + } + + @Test + void testPartitionSpecEvolutionRemoveField() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required(1, "data", Types.StringType.get())); + + PartitionSpec spec1 = PartitionSpec.builderFor(schema).bucket("id", 23).build(); + // Remove field + PartitionSpec spec2 = PartitionSpec.builderFor(schema).build(); + + assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); + PartitionSpecEvolution.PartitionSpecChanges result = + PartitionSpecEvolution.evolve(spec1, spec2); + + assertThat(result.termsToAdd().toString()).isEqualTo("[]"); + assertThat(result.termsToRemove().toString()).isEqualTo("[bucket[23](ref(name=\"id\"))]"); + } + + @Test + void testPartitionSpecEvolutionWithNestedFields() { + Schema schema1 = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get()), + Types.NestedField.required( + 1, + "data", + Types.StructType.of(Types.NestedField.required(2, "str", Types.StringType.get())))); + + PartitionSpec spec1 = PartitionSpec.builderFor(schema1).bucket("data.str", 10).build(); + + Schema schema2 = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required( + 2, + "data", + Types.StructType.of(Types.NestedField.required(3, "str", Types.StringType.get())))); + + // Change num buckets + PartitionSpec spec2 = PartitionSpec.builderFor(schema2).bucket("data.str", 23).build(); + + assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); + PartitionSpecEvolution.PartitionSpecChanges result = + PartitionSpecEvolution.evolve(spec1, spec2); + + assertThat(result.termsToAdd().toString()).isEqualTo("[bucket[23](ref(name=\"data.str\"))]"); + assertThat(result.termsToRemove().toString()).isEqualTo("[bucket[10](ref(name=\"data.str\"))]"); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java new file mode 100644 index 000000000000..c4a86bb79e4a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java @@ -0,0 +1,262 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.math.BigDecimal; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.DataGenerator; +import org.apache.iceberg.flink.DataGenerators; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.Days; +import org.junit.jupiter.api.Test; + +class TestRowDataConverter { + + static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + static final Schema SCHEMA2 = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "onemore", Types.DoubleType.get())); + + @Test + void testPrimitiveTypes() { + DataGenerator generator = new DataGenerators.Primitives(); + assertThat( + convert( + generator.generateFlinkRowData(), + generator.icebergSchema(), + generator.icebergSchema())) + .isEqualTo(generator.generateFlinkRowData()); + } + + @Test + void testAddColumn() { + assertThat(convert(SimpleDataUtil.createRowData(1, "a"), SCHEMA, SCHEMA2)) + .isEqualTo(GenericRowData.of(1, StringData.fromString("a"), null)); + } + + @Test + void testAddRequiredColumn() { + Schema currentSchema = new Schema(Types.NestedField.optional(1, "id", Types.IntegerType.get())); + Schema targetSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + required(2, "data", Types.StringType.get())); + + assertThatThrownBy(() -> convert(GenericRowData.of(42), currentSchema, targetSchema)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("is non-nullable but does not exist in source schema"); + } + + @Test + void testIntToLong() { + Schema schemaWithLong = + new Schema( + Types.NestedField.optional(2, "id", Types.LongType.get()), + Types.NestedField.optional(4, "data", Types.StringType.get())); + + assertThat(convert(SimpleDataUtil.createRowData(1, "a"), SimpleDataUtil.SCHEMA, schemaWithLong)) + .isEqualTo(GenericRowData.of(1L, StringData.fromString("a"))); + } + + @Test + void testFloatToDouble() { + Schema schemaWithFloat = + new Schema(Types.NestedField.optional(1, "float2double", Types.FloatType.get())); 
+ Schema schemaWithDouble = + new Schema(Types.NestedField.optional(2, "float2double", Types.DoubleType.get())); + + assertThat(convert(GenericRowData.of(1.5f), schemaWithFloat, schemaWithDouble)) + .isEqualTo(GenericRowData.of(1.5d)); + } + + @Test + void testDateToTimestamp() { + Schema schemaWithFloat = + new Schema(Types.NestedField.optional(1, "date2timestamp", Types.DateType.get())); + Schema schemaWithDouble = + new Schema( + Types.NestedField.optional(2, "date2timestamp", Types.TimestampType.withoutZone())); + + DateTime time = new DateTime(2022, 1, 10, 0, 0, 0, 0, DateTimeZone.UTC); + int days = + Days.daysBetween(new DateTime(1970, 1, 1, 0, 0, 0, 0, DateTimeZone.UTC), time).getDays(); + + assertThat(convert(GenericRowData.of(days), schemaWithFloat, schemaWithDouble)) + .isEqualTo(GenericRowData.of(TimestampData.fromEpochMillis(time.getMillis()))); + } + + @Test + void testIncreasePrecision() { + Schema before = + new Schema(Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2))); + Schema after = + new Schema(Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(10, 2))); + + assertThat( + convert( + GenericRowData.of(DecimalData.fromBigDecimal(new BigDecimal("-1.50"), 9, 2)), + before, + after)) + .isEqualTo(GenericRowData.of(DecimalData.fromBigDecimal(new BigDecimal("-1.50"), 10, 2))); + } + + @Test + void testStructAddOptionalFields() { + DataGenerator generator = new DataGenerators.StructOfPrimitive(); + RowData oldData = generator.generateFlinkRowData(); + Schema oldSchema = generator.icebergSchema(); + Types.NestedField structField = oldSchema.columns().get(1); + Schema newSchema = + new Schema( + oldSchema.columns().get(0), + Types.NestedField.required( + 10, + structField.name(), + Types.StructType.of( + required(101, "id", Types.IntegerType.get()), + optional(103, "optional", Types.StringType.get()), + required(102, "name", Types.StringType.get())))); + RowData newData = + GenericRowData.of( + StringData.fromString("row_id_value"), + GenericRowData.of(1, null, StringData.fromString("Jane"))); + + assertThat(convert(oldData, oldSchema, newSchema)).isEqualTo(newData); + } + + @Test + void testStructAddRequiredFieldsWithOptionalRoot() { + DataGenerator generator = new DataGenerators.StructOfPrimitive(); + RowData oldData = generator.generateFlinkRowData(); + Schema oldSchema = generator.icebergSchema(); + Types.NestedField structField = oldSchema.columns().get(1); + Schema newSchema = + new Schema( + oldSchema.columns().get(0), + Types.NestedField.optional( + 10, + "newFieldOptionalField", + Types.StructType.of( + Types.NestedField.optional( + structField.fieldId(), + structField.name(), + Types.StructType.of( + optional(101, "id", Types.IntegerType.get()), + // Required columns which leads to nulling the entire struct + required(103, "required", Types.StringType.get()), + required(102, "name", Types.StringType.get())))))); + + RowData expectedData = GenericRowData.of(StringData.fromString("row_id_value"), null); + + assertThat(convert(oldData, oldSchema, newSchema)).isEqualTo(expectedData); + } + + @Test + void testStructAddRequiredFields() { + DataGenerator generator = new DataGenerators.StructOfPrimitive(); + RowData oldData = generator.generateFlinkRowData(); + Schema oldSchema = generator.icebergSchema(); + Types.NestedField structField = oldSchema.columns().get(1); + Schema newSchema = + new Schema( + oldSchema.columns().get(0), + Types.NestedField.required( + 10, + structField.name(), + Types.StructType.of( + required(101, "id", 
Types.IntegerType.get()), + required(103, "required", Types.StringType.get()), + required(102, "name", Types.StringType.get())))); + + assertThatThrownBy(() -> convert(oldData, oldSchema, newSchema)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("is non-nullable but does not exist in source schema"); + } + + @Test + void testMap() { + DataGenerator generator = new DataGenerators.MapOfPrimitives(); + RowData oldData = generator.generateFlinkRowData(); + Schema oldSchema = generator.icebergSchema(); + Types.NestedField mapField = oldSchema.columns().get(1); + Schema newSchema = + new Schema( + oldSchema.columns().get(0), + Types.NestedField.optional( + 10, + mapField.name(), + Types.MapType.ofRequired(101, 102, Types.StringType.get(), Types.LongType.get()))); + RowData newData = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("Jane"), 1L, StringData.fromString("Joe"), 2L))); + + assertThat(convert(oldData, oldSchema, newSchema)).isEqualTo(newData); + } + + @Test + void testArray() { + DataGenerator generator = new DataGenerators.ArrayOfPrimitive(); + RowData oldData = generator.generateFlinkRowData(); + Schema oldSchema = generator.icebergSchema(); + Types.NestedField arrayField = oldSchema.columns().get(1); + Schema newSchema = + new Schema( + oldSchema.columns().get(0), + Types.NestedField.optional( + 10, arrayField.name(), Types.ListType.ofOptional(101, Types.LongType.get()))); + RowData newData = + GenericRowData.of( + StringData.fromString("row_id_value"), new GenericArrayData(new Long[] {1L, 2L, 3L})); + + assertThat(convert(oldData, oldSchema, newSchema)).isEqualTo(newData); + } + + private static RowData convert(RowData sourceData, Schema sourceSchema, Schema targetSchema) { + return (RowData) + DataConverter.get( + FlinkSchemaUtil.convert(sourceSchema), FlinkSchemaUtil.convert(targetSchema)) + .convert(sourceData); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java new file mode 100644 index 000000000000..2264cc3a8db0 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.commons.lang3.SerializationUtils; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.sink.TestFlinkIcebergSinkBase; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestTableMetadataCache extends TestFlinkIcebergSinkBase { + + static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + static final Schema SCHEMA2 = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "extra", Types.StringType.get())); + + @Test + void testCaching() { + Catalog catalog = CATALOG_EXTENSION.catalog(); + TableIdentifier tableIdentifier = TableIdentifier.parse("default.myTable"); + catalog.createTable(tableIdentifier, SCHEMA); + TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); + + Schema schema1 = cache.schema(tableIdentifier, SCHEMA).resolvedTableSchema(); + assertThat(schema1.sameSchema(SCHEMA)).isTrue(); + assertThat( + cache.schema(tableIdentifier, SerializationUtils.clone(SCHEMA)).resolvedTableSchema()) + .isEqualTo(schema1); + + assertThat(cache.schema(tableIdentifier, SCHEMA2)).isEqualTo(TableMetadataCache.NOT_FOUND); + + schema1 = cache.schema(tableIdentifier, SCHEMA).resolvedTableSchema(); + assertThat( + cache.schema(tableIdentifier, SerializationUtils.clone(SCHEMA)).resolvedTableSchema()) + .isEqualTo(schema1); + } + + @Test + void testCacheInvalidationAfterSchemaChange() { + Catalog catalog = CATALOG_EXTENSION.catalog(); + TableIdentifier tableIdentifier = TableIdentifier.parse("default.myTable"); + catalog.createTable(tableIdentifier, SCHEMA); + TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); + TableUpdater tableUpdater = new TableUpdater(cache, catalog); + + Schema schema1 = cache.schema(tableIdentifier, SCHEMA).resolvedTableSchema(); + assertThat(schema1.sameSchema(SCHEMA)).isTrue(); + + catalog.dropTable(tableIdentifier); + catalog.createTable(tableIdentifier, SCHEMA2); + tableUpdater.update(tableIdentifier, "main", SCHEMA2, PartitionSpec.unpartitioned()); + + Schema schema2 = cache.schema(tableIdentifier, SCHEMA2).resolvedTableSchema(); + assertThat(schema2.sameSchema(SCHEMA2)).isTrue(); + } + + @Test + void testCachingDisabled() { + Catalog catalog = CATALOG_EXTENSION.catalog(); + TableIdentifier tableIdentifier = TableIdentifier.parse("default.myTable"); + catalog.createTable(tableIdentifier, SCHEMA); + TableMetadataCache cache = new TableMetadataCache(catalog, 0, Long.MAX_VALUE, 10); + + assertThat(cache.getInternalCache()).isEmpty(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java new file mode 100644 index 000000000000..1cf2c8bae001 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.apache.iceberg.types.Types.DoubleType; +import static org.apache.iceberg.types.Types.LongType; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.iceberg.types.Types.StringType; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.function.Supplier; +import org.apache.flink.api.java.tuple.Tuple3; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +class TestTableSerializerCache { + + @RegisterExtension + static final HadoopCatalogExtension CATALOG_EXTENSION = new HadoopCatalogExtension("db", "table"); + + Schema schema1 = new Schema(23, required(1, "id", LongType.get())); + + Schema schema2 = + new Schema( + 42, + required(1, "id", LongType.get()), + optional(2, "data", StringType.get()), + optional(3, "double", DoubleType.get())); + + TableSerializerCache cache = new TableSerializerCache(CATALOG_EXTENSION.catalogLoader(), 10); + + @Test + void testFullSchemaCaching() { + Supplier creator1a = + () -> cache.serializer("table", schema1, PartitionSpec.unpartitioned()); + Supplier creator1b = + () -> cache.serializer("table", schema2, PartitionSpec.unpartitioned()); + Supplier creator2 = + () -> cache.serializer("table2", schema2, PartitionSpec.unpartitioned()); + + RowDataSerializer serializer1a = creator1a.get(); + RowDataSerializer serializer1b = creator1b.get(); + RowDataSerializer serializer2 = creator2.get(); + assertThat(serializer1a).isNotSameAs(serializer1b).isNotSameAs(serializer2); + + assertThat(serializer1a).isSameAs(creator1a.get()); + assertThat(serializer1b).isSameAs(creator1b.get()); + assertThat(serializer2).isSameAs(creator2.get()); + } + + @Test + void testCachingWithSchemaLookup() { + CatalogLoader catalogLoader = CATALOG_EXTENSION.catalogLoader(); + cache = new TableSerializerCache(catalogLoader, 10); + + Catalog catalog = catalogLoader.loadCatalog(); + Table table = catalog.createTable(TableIdentifier.of("table"), schema1); + + Tuple3 serializerWithSchemaAndSpec = + cache.serializerWithSchemaAndSpec( + "table", table.schema().schemaId(), PartitionSpec.unpartitioned().specId()); + assertThat(serializerWithSchemaAndSpec).isNotNull(); + assertThat(serializerWithSchemaAndSpec.f0).isNotNull(); + 
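// schema and spec resolved by the cache should match the table created in the catalog +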
assertThat(serializerWithSchemaAndSpec.f1.sameSchema(table.schema())).isTrue(); + assertThat(serializerWithSchemaAndSpec.f2).isEqualTo(table.spec()); + + Tuple3 serializerWithSchemaAndSpec2 = + cache.serializerWithSchemaAndSpec( + "table", table.schema().schemaId(), PartitionSpec.unpartitioned().specId()); + + assertThat(serializerWithSchemaAndSpec.f0).isSameAs(serializerWithSchemaAndSpec2.f0); + assertThat(serializerWithSchemaAndSpec.f1).isSameAs(serializerWithSchemaAndSpec2.f1); + assertThat(serializerWithSchemaAndSpec.f2).isSameAs(serializerWithSchemaAndSpec2.f2); + } + + @Test + void testCacheEviction() { + cache = new TableSerializerCache(CATALOG_EXTENSION.catalogLoader(), 0); + assertThat(cache.maximumSize()).isEqualTo(0); + + Supplier creator1 = + () -> cache.serializer("table", schema1, PartitionSpec.unpartitioned()); + Supplier creator2 = + () -> cache.serializer("table2", schema2, PartitionSpec.unpartitioned()); + + RowDataSerializer serializer1 = creator1.get(); + RowDataSerializer serializer2 = creator2.get(); + + cache.getCache().clear(); + assertThat(serializer1).isNotSameAs(creator1.get()); + assertThat(serializer2).isNotSameAs(creator2.get()); + } + + @Test + void testCacheSize() { + cache = new TableSerializerCache(CATALOG_EXTENSION.catalogLoader(), 1000); + assertThat(cache.maximumSize()).isEqualTo(1000); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java new file mode 100644 index 000000000000..ad35d929728d --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Map; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.sink.TestFlinkIcebergSinkBase; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestTableUpdater extends TestFlinkIcebergSinkBase { + + static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + static final Schema SCHEMA2 = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "extra", Types.StringType.get())); + + @Test + void testTableCreation() { + Catalog catalog = CATALOG_EXTENSION.catalog(); + TableIdentifier tableIdentifier = TableIdentifier.parse("myTable"); + TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); + TableUpdater tableUpdater = new TableUpdater(cache, catalog); + + tableUpdater.update(tableIdentifier, "main", SCHEMA, PartitionSpec.unpartitioned()); + assertThat(catalog.tableExists(tableIdentifier)).isTrue(); + + TableMetadataCache.ResolvedSchemaInfo cachedSchema = cache.schema(tableIdentifier, SCHEMA); + assertThat(cachedSchema.resolvedTableSchema().sameSchema(SCHEMA)).isTrue(); + } + + @Test + void testTableAlreadyExists() { + Catalog catalog = CATALOG_EXTENSION.catalog(); + TableIdentifier tableIdentifier = TableIdentifier.parse("myTable"); + TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); + TableUpdater tableUpdater = new TableUpdater(cache, catalog); + + // Make the table non-existent in cache + cache.exists(tableIdentifier); + // Create the table + catalog.createTable(tableIdentifier, SCHEMA); + // Make sure that the cache is invalidated and the table refreshed without an error + Tuple2 result = + tableUpdater.update(tableIdentifier, "main", SCHEMA, PartitionSpec.unpartitioned()); + assertThat(result.f0.resolvedTableSchema().sameSchema(SCHEMA)).isTrue(); + assertThat(result.f0.compareResult()).isEqualTo(CompareSchemasVisitor.Result.SAME); + assertThat(result.f1).isEqualTo(PartitionSpec.unpartitioned()); + } + + @Test + void testBranchCreationAndCaching() { + Catalog catalog = CATALOG_EXTENSION.catalog(); + TableIdentifier tableIdentifier = TableIdentifier.parse("myTable"); + TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); + TableUpdater tableUpdater = new TableUpdater(cache, catalog); + + catalog.createTable(tableIdentifier, SCHEMA); + tableUpdater.update(tableIdentifier, "myBranch", SCHEMA, PartitionSpec.unpartitioned()); + TableMetadataCache.CacheItem cacheItem = cache.getInternalCache().get(tableIdentifier); + assertThat(cacheItem).isNotNull(); + + tableUpdater.update(tableIdentifier, "myBranch", SCHEMA, PartitionSpec.unpartitioned()); + assertThat(cache.getInternalCache()).contains(Map.entry(tableIdentifier, cacheItem)); + } + + @Test + void testSpecCreation() { + Catalog catalog = CATALOG_EXTENSION.catalog(); + TableIdentifier tableIdentifier = TableIdentifier.parse("myTable"); + TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 
10); + TableUpdater tableUpdater = new TableUpdater(cache, catalog); + + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).bucket("data", 10).build(); + tableUpdater.update(tableIdentifier, "main", SCHEMA, spec); + + Table table = catalog.loadTable(tableIdentifier); + assertThat(table).isNotNull(); + assertThat(table.spec()).isEqualTo(spec); + } + + @Test + void testInvalidateOldCacheEntryOnUpdate() { + Catalog catalog = CATALOG_EXTENSION.catalog(); + TableIdentifier tableIdentifier = TableIdentifier.parse("default.myTable"); + catalog.createTable(tableIdentifier, SCHEMA); + TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); + cache.schema(tableIdentifier, SCHEMA); + TableUpdater tableUpdater = new TableUpdater(cache, catalog); + + Schema updated = + tableUpdater + .update(tableIdentifier, "main", SCHEMA2, PartitionSpec.unpartitioned()) + .f0 + .resolvedTableSchema(); + assertThat(updated.sameSchema(SCHEMA2)).isTrue(); + assertThat(cache.schema(tableIdentifier, SCHEMA2).resolvedTableSchema().sameSchema(SCHEMA2)) + .isTrue(); + } + + @Test + void testLastResultInvalidation() { + Catalog catalog = CATALOG_EXTENSION.catalog(); + TableIdentifier tableIdentifier = TableIdentifier.parse("default.myTable"); + catalog.createTable(tableIdentifier, SCHEMA); + TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); + TableUpdater tableUpdater = new TableUpdater(cache, catalog); + + // Initialize cache + tableUpdater.update(tableIdentifier, "main", SCHEMA, PartitionSpec.unpartitioned()); + + // Update table behind the scenes + catalog.dropTable(tableIdentifier); + catalog.createTable(tableIdentifier, SCHEMA2); + + // Cache still stores the old information + assertThat(cache.schema(tableIdentifier, SCHEMA2).compareResult()) + .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); + + assertThat( + tableUpdater + .update(tableIdentifier, "main", SCHEMA2, PartitionSpec.unpartitioned()) + .f0 + .compareResult()) + .isEqualTo(CompareSchemasVisitor.Result.SAME); + + // Last result cache should be cleared + assertThat(cache.getInternalCache().get(tableIdentifier).inputSchemas()) + .doesNotContainKey(SCHEMA2); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java new file mode 100644 index 000000000000..b0d98b358b6d --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.flink.sink.shuffle;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableMap;
+import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
+import org.apache.iceberg.SortKey;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+
+public class DataDistributionUtil {
+  private DataDistributionUtil() {}
+
+  private static final String CHARS =
+      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-.!?";
+
+  /** Generate a random string with a given prefix and a random length up to maxLength. */
+  public static String randomString(String prefix, int maxLength) {
+    int length = ThreadLocalRandom.current().nextInt(maxLength);
+    byte[] buffer = new byte[length];
+
+    for (int i = 0; i < length; i += 1) {
+      buffer[i] = (byte) CHARS.charAt(ThreadLocalRandom.current().nextInt(CHARS.length()));
+    }
+
+    return prefix + new String(buffer, StandardCharsets.UTF_8);
+  }
+
+  /**
+   * Returns the index such that weightsCDF[index] > target and either index == 0 or
+   * weightsCDF[index - 1] <= target.
+   */
+  public static int binarySearchIndex(long[] weightsCDF, long target) {
+    Preconditions.checkArgument(
+        target >= 0, "target weight must be non-negative: search target = %s", target);
+    Preconditions.checkArgument(
+        target < weightsCDF[weightsCDF.length - 1],
+        "target weight is out of range: total weight = %s, search target = %s",
+        weightsCDF[weightsCDF.length - 1],
+        target);
+
+    int start = 0;
+    int end = weightsCDF.length - 1;
+    while (start <= end) {
+      int mid = (start + end) / 2;
+      boolean leftOk = (mid == 0) || (weightsCDF[mid - 1] <= target);
+      boolean rightOk = weightsCDF[mid] > target;
+      if (leftOk && rightOk) {
+        return mid;
+      } else if (weightsCDF[mid] <= target) {
+        start = mid + 1;
+      } else {
+        end = mid - 1;
+      }
+    }
+
+    throw new IllegalStateException("should never reach here");
+  }
+
+  /** Key is the integer id and value is the weight as a long. */
+  public static NavigableMap<Integer, Long> longTailDistribution(
+      long startingWeight,
+      int longTailStartingIndex,
+      int longTailLength,
+      long longTailBaseWeight,
+      double weightRandomJitterPercentage,
+      double decayFactor) {
+
+    NavigableMap<Integer, Long> weights = Maps.newTreeMap();
+
+    // decay part
+    long currentWeight = startingWeight;
+    for (int index = 0; index < longTailStartingIndex; ++index) {
+      double jitter = ThreadLocalRandom.current().nextDouble(weightRandomJitterPercentage / 100);
+      long weight = (long) (currentWeight * (1.0 + jitter));
+      weight = weight > 0 ? weight : 1;
+      weights.put(index, weight);
+      if (currentWeight > longTailBaseWeight) {
+        // geometric decay toward the long-tail base weight
+        currentWeight = (long) (currentWeight * decayFactor);
+      }
+    }
+
+    // long tail part (flat with some random jitter)
+    for (int index = longTailStartingIndex;
+        index < longTailStartingIndex + longTailLength;
+        ++index) {
+      long longTailWeight =
+          (long)
+              (longTailBaseWeight
+                  * ThreadLocalRandom.current().nextDouble(weightRandomJitterPercentage));
+      longTailWeight = longTailWeight > 0 ?
longTailWeight : 1; + weights.put(index, longTailWeight); + } + + return weights; + } + + public static Map mapStatisticsWithLongTailDistribution( + NavigableMap weights, SortKey sortKey) { + Map mapStatistics = Maps.newHashMapWithExpectedSize(weights.size()); + weights.forEach( + (id, weight) -> { + SortKey sortKeyCopy = sortKey.copy(); + sortKeyCopy.set(0, id); + mapStatistics.put(sortKeyCopy, weight); + }); + + return mapStatistics; + } + + public static long[] computeCumulativeWeights(List keys, Map weights) { + long[] weightsCDF = new long[keys.size()]; + long totalWeight = 0; + for (int i = 0; i < keys.size(); ++i) { + totalWeight += weights.get(keys.get(i)); + weightsCDF[i] = totalWeight; + } + + return weightsCDF; + } + + public static byte[] uuidBytes(UUID uuid) { + ByteBuffer bb = ByteBuffer.wrap(new byte[16]); + bb.putLong(uuid.getMostSignificantBits()); + bb.putLong(uuid.getLeastSignificantBits()); + return bb.array(); + } + + public static UUID[] reservoirSampleUUIDs(int sampleSize, int reservoirSize) { + UUID[] reservoir = new UUID[reservoirSize]; + for (int i = 0; i < reservoirSize; ++i) { + reservoir[i] = UUID.randomUUID(); + } + + ThreadLocalRandom random = ThreadLocalRandom.current(); + for (int i = reservoirSize; i < sampleSize; ++i) { + int rand = random.nextInt(i + 1); + if (rand < reservoirSize) { + reservoir[rand] = UUID.randomUUID(); + } + } + + Arrays.sort(reservoir); + return reservoir; + } + + public static UUID[] rangeBoundSampleUUIDs(UUID[] sampledUUIDs, int rangeBoundSize) { + UUID[] rangeBounds = new UUID[rangeBoundSize]; + int step = sampledUUIDs.length / rangeBoundSize; + for (int i = 0; i < rangeBoundSize; ++i) { + rangeBounds[i] = sampledUUIDs[i * step]; + } + Arrays.sort(rangeBounds); + return rangeBounds; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java new file mode 100644 index 000000000000..5910bd685510 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.flink.sink.shuffle;
+
+import java.util.Comparator;
+import java.util.Map;
+import org.apache.flink.api.common.typeutils.TypeSerializer;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.runtime.typeutils.RowDataSerializer;
+import org.apache.flink.table.types.logical.IntType;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.table.types.logical.VarCharType;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.SortKey;
+import org.apache.iceberg.SortOrder;
+import org.apache.iceberg.SortOrderComparators;
+import org.apache.iceberg.StructLike;
+import org.apache.iceberg.flink.RowDataWrapper;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.types.Types;
+
+class Fixtures {
+  private Fixtures() {}
+
+  public static final int NUM_SUBTASKS = 2;
+  public static final Schema SCHEMA =
+      new Schema(
+          Types.NestedField.optional(1, "id", Types.StringType.get()),
+          Types.NestedField.optional(2, "number", Types.IntegerType.get()));
+  public static final RowType ROW_TYPE = RowType.of(new VarCharType(), new IntType());
+  public static final TypeSerializer<RowData> ROW_SERIALIZER = new RowDataSerializer(ROW_TYPE);
+  public static final RowDataWrapper ROW_WRAPPER = new RowDataWrapper(ROW_TYPE, SCHEMA.asStruct());
+  public static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build();
+  public static final Comparator<StructLike> SORT_ORDER_COMPARTOR =
+      SortOrderComparators.forSchema(SCHEMA, SORT_ORDER);
+  public static final SortKeySerializer SORT_KEY_SERIALIZER =
+      new SortKeySerializer(SCHEMA, SORT_ORDER);
+  public static final DataStatisticsSerializer TASK_STATISTICS_SERIALIZER =
+      new DataStatisticsSerializer(SORT_KEY_SERIALIZER);
+  public static final GlobalStatisticsSerializer GLOBAL_STATISTICS_SERIALIZER =
+      new GlobalStatisticsSerializer(SORT_KEY_SERIALIZER);
+  public static final CompletedStatisticsSerializer COMPLETED_STATISTICS_SERIALIZER =
+      new CompletedStatisticsSerializer(SORT_KEY_SERIALIZER);
+
+  public static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER);
+  public static final Map<String, SortKey> CHAR_KEYS = createCharKeys();
+
+  public static StatisticsEvent createStatisticsEvent(
+      StatisticsType type,
+      TypeSerializer<DataStatistics> statisticsSerializer,
+      long checkpointId,
+      SortKey... keys) {
+    DataStatistics statistics = createTaskStatistics(type, keys);
+    return StatisticsEvent.createTaskStatisticsEvent(
+        checkpointId, statistics, statisticsSerializer);
+  }
+
+  public static DataStatistics createTaskStatistics(StatisticsType type, SortKey...
keys) { + DataStatistics statistics; + if (type == StatisticsType.Sketch) { + statistics = new SketchDataStatistics(128); + } else { + statistics = new MapDataStatistics(); + } + + for (SortKey key : keys) { + statistics.add(key); + } + + return statistics; + } + + private static Map createCharKeys() { + Map keys = Maps.newHashMap(); + for (char c = 'a'; c <= 'z'; ++c) { + String key = Character.toString(c); + SortKey sortKey = SORT_KEY.copy(); + sortKey.set(0, key); + keys.put(key, sortKey); + } + + return keys; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java new file mode 100644 index 000000000000..8322ce683768 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java @@ -0,0 +1,465 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.TASK_STATISTICS_SERIALIZER; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.createStatisticsEvent; +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestAggregatedStatisticsTracker { + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void receiveNewerStatisticsEvent(StatisticsType type) { + AggregatedStatisticsTracker tracker = createTracker(type); + + StatisticsEvent checkpoint1Subtask0StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); + CompletedStatistics completedStatistics = + tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + AggregatedStatisticsTracker.Aggregation aggregation = + tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a")); + } + + StatisticsEvent checkpoint2Subtask0StatisticsEvent = + createStatisticsEvent( + type, + 
TASK_STATISTICS_SERIALIZER, + 2L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b")); + completedStatistics = tracker.updateAndCheckCompletion(0, checkpoint2Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + // both checkpoints are tracked + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L); + aggregation = tracker.aggregationsPerCheckpoint().get(2L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); + } + + StatisticsEvent checkpoint1Subtask1StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); + // checkpoint 1 is completed + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + assertThat(completedStatistics.checkpointId()).isEqualTo(1L); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()) + .isEqualTo( + ImmutableMap.of( + CHAR_KEYS.get("a"), 1L, + CHAR_KEYS.get("b"), 1L)); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); + } + + // checkpoint 2 remains + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); + aggregation = tracker.aggregationsPerCheckpoint().get(2L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); + } + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void receiveOlderStatisticsEventTest(StatisticsType type) { + AggregatedStatisticsTracker tracker = createTracker(type); + + StatisticsEvent checkpoint2Subtask0StatisticsEvent = + createStatisticsEvent( + type, + TASK_STATISTICS_SERIALIZER, + 2L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b")); + CompletedStatistics completedStatistics = + tracker.updateAndCheckCompletion(0, checkpoint2Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); + AggregatedStatisticsTracker.Aggregation aggregation = + tracker.aggregationsPerCheckpoint().get(2L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); + } + + StatisticsEvent checkpoint1Subtask1StatisticsEvent = + 
createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); + assertThat(completedStatistics).isNull(); + // both checkpoints are tracked + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L); + aggregation = tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("b"), 1L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("b")); + } + + StatisticsEvent checkpoint3Subtask0StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 3L, CHAR_KEYS.get("x")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint3Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L, 3L); + aggregation = tracker.aggregationsPerCheckpoint().get(3L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("x"), 1L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("x")); + } + + StatisticsEvent checkpoint2Subtask1StatisticsEvent = + createStatisticsEvent( + type, + TASK_STATISTICS_SERIALIZER, + 2L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint2Subtask1StatisticsEvent); + // checkpoint 1 is cleared along with checkpoint 2. 
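completing a newer checkpoint also drops older pending aggregations, so only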
checkpoint 3 remains + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(3L); + aggregation = tracker.aggregationsPerCheckpoint().get(3L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("x"), 1L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("x")); + } + + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + assertThat(completedStatistics.checkpointId()).isEqualTo(2L); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()) + .isEqualTo( + ImmutableMap.of( + CHAR_KEYS.get("a"), 2L, + CHAR_KEYS.get("b"), 4L)); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly( + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b")); + } + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void receiveCompletedStatisticsEvent(StatisticsType type) { + AggregatedStatisticsTracker tracker = createTracker(type); + + StatisticsEvent checkpoint1Subtask0DataStatisticEvent = + createStatisticsEvent( + type, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b")); + + CompletedStatistics completedStatistics = + tracker.updateAndCheckCompletion(0, checkpoint1Subtask0DataStatisticEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + AggregatedStatisticsTracker.Aggregation aggregation = + tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); + } + + StatisticsEvent checkpoint1Subtask1DataStatisticEvent = + createStatisticsEvent( + type, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b")); + + // Receive data statistics from all subtasks at checkpoint 1 + completedStatistics = + tracker.updateAndCheckCompletion(1, checkpoint1Subtask1DataStatisticEvent); + assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); + + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + assertThat(completedStatistics.checkpointId()).isEqualTo(1L); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()) + .isEqualTo( + ImmutableMap.of( + CHAR_KEYS.get("a"), 3L, + CHAR_KEYS.get("b"), 3L)); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly( + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("a"), + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b")); + } + + StatisticsEvent checkpoint2Subtask0DataStatisticEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("a")); + completedStatistics = + 
tracker.updateAndCheckCompletion(0, checkpoint2Subtask0DataStatisticEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); + aggregation = tracker.aggregationsPerCheckpoint().get(2L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a")); + } + + StatisticsEvent checkpoint2Subtask1DataStatisticEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("b")); + // Receive data statistics from all subtasks at checkpoint 2 + completedStatistics = + tracker.updateAndCheckCompletion(1, checkpoint2Subtask1DataStatisticEvent); + assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); + + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.checkpointId()).isEqualTo(2L); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()) + .isEqualTo( + ImmutableMap.of( + CHAR_KEYS.get("a"), 1L, + CHAR_KEYS.get("b"), 1L)); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); + } + } + + @Test + public void coordinatorSwitchToSketchOverThreshold() { + int parallelism = 3; + int downstreamParallelism = 3; + int switchToSketchThreshold = 3; + AggregatedStatisticsTracker tracker = + new AggregatedStatisticsTracker( + "testOperator", + parallelism, + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + downstreamParallelism, + StatisticsType.Auto, + switchToSketchThreshold, + null); + + StatisticsEvent checkpoint1Subtask0StatisticsEvent = + createStatisticsEvent( + StatisticsType.Map, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b")); + CompletedStatistics completedStatistics = + tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + AggregatedStatisticsTracker.Aggregation aggregation = + tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); + assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Map); + assertThat(aggregation.sketchStatistics()).isNull(); + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); + + StatisticsEvent checkpoint1Subtask1StatisticsEvent = + createStatisticsEvent( + StatisticsType.Map, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + aggregation = tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0, 1); + // converted to sketch statistics as map size is 4 (over the switch threshold of 3) + assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Sketch); + assertThat(aggregation.mapStatistics()).isNull(); + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder( + 
CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("d")); + + StatisticsEvent checkpoint1Subtask2StatisticsEvent = + createStatisticsEvent( + StatisticsType.Map, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f")); + completedStatistics = tracker.updateAndCheckCompletion(2, checkpoint1Subtask2StatisticsEvent); + assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsType.Sketch); + assertThat(completedStatistics.keySamples()) + .containsExactly( + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d"), + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f")); + } + + @Test + public void coordinatorMapOperatorSketch() { + int parallelism = 3; + int downstreamParallelism = 3; + AggregatedStatisticsTracker tracker = + new AggregatedStatisticsTracker( + "testOperator", + parallelism, + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + downstreamParallelism, + StatisticsType.Auto, + SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, + null); + + // first operator event has map statistics + StatisticsEvent checkpoint1Subtask0StatisticsEvent = + createStatisticsEvent( + StatisticsType.Map, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b")); + CompletedStatistics completedStatistics = + tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + AggregatedStatisticsTracker.Aggregation aggregation = + tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); + assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Map); + assertThat(aggregation.sketchStatistics()).isNull(); + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); + + // second operator event contains sketch statistics + StatisticsEvent checkpoint1Subtask1StatisticsEvent = + createStatisticsEvent( + StatisticsType.Sketch, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + aggregation = tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0, 1); + assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Sketch); + assertThat(aggregation.mapStatistics()).isNull(); + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder( + CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("d")); + + // third operator event has Map statistics + StatisticsEvent checkpoint1Subtask2StatisticsEvent = + createStatisticsEvent( + StatisticsType.Map, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f")); + completedStatistics = tracker.updateAndCheckCompletion(2, checkpoint1Subtask2StatisticsEvent); + assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsType.Sketch); + assertThat(completedStatistics.keySamples()) + .containsExactly( + CHAR_KEYS.get("a"), + 
CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d"), + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f")); + } + + private AggregatedStatisticsTracker createTracker(StatisticsType type) { + return new AggregatedStatisticsTracker( + "testOperator", + Fixtures.NUM_SUBTASKS, + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + Fixtures.NUM_SUBTASKS, + type, + SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, + null); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java new file mode 100644 index 000000000000..1975d7e8d654 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import org.apache.flink.api.common.typeutils.SerializerTestBase; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.Test; + +public class TestCompletedStatisticsSerializer extends SerializerTestBase { + + @Override + protected TypeSerializer createSerializer() { + return Fixtures.COMPLETED_STATISTICS_SERIALIZER; + } + + @Override + protected int getLength() { + return -1; + } + + @Override + protected Class getTypeClass() { + return CompletedStatistics.class; + } + + @Override + protected CompletedStatistics[] getTestData() { + + return new CompletedStatistics[] { + CompletedStatistics.fromKeyFrequency( + 1L, ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)), + CompletedStatistics.fromKeySamples(2L, new SortKey[] {CHAR_KEYS.get("a"), CHAR_KEYS.get("b")}) + }; + } + + @Test + public void testSerializer() throws Exception { + TypeSerializer completedStatisticsTypeSerializer = createSerializer(); + CompletedStatistics[] data = getTestData(); + DataOutputSerializer output = new DataOutputSerializer(1024); + completedStatisticsTypeSerializer.serialize(data[0], output); + byte[] serializedBytes = output.getCopyOfBuffer(); + + DataInputDeserializer input = new DataInputDeserializer(serializedBytes); + CompletedStatistics deserialized = completedStatisticsTypeSerializer.deserialize(input); + assertThat(deserialized).isEqualTo(data[0]); + } + + @Test + public void 
testRestoreOldVersionSerializer() throws Exception { + CompletedStatisticsSerializer completedStatisticsTypeSerializer = + (CompletedStatisticsSerializer) createSerializer(); + completedStatisticsTypeSerializer.changeSortKeySerializerVersion(1); + CompletedStatistics[] data = getTestData(); + DataOutputSerializer output = new DataOutputSerializer(1024); + completedStatisticsTypeSerializer.serialize(data[0], output); + byte[] serializedBytes = output.getCopyOfBuffer(); + + completedStatisticsTypeSerializer.changeSortKeySerializerVersionLatest(); + CompletedStatistics completedStatistics = + StatisticsUtil.deserializeCompletedStatistics( + serializedBytes, completedStatisticsTypeSerializer); + assertThat(completedStatistics).isEqualTo(data[0]); + } + + @Test + public void testRestoreNewSerializer() throws Exception { + CompletedStatisticsSerializer completedStatisticsTypeSerializer = + (CompletedStatisticsSerializer) createSerializer(); + CompletedStatistics[] data = getTestData(); + DataOutputSerializer output = new DataOutputSerializer(1024); + completedStatisticsTypeSerializer.serialize(data[0], output); + byte[] serializedBytes = output.getCopyOfBuffer(); + + CompletedStatistics completedStatistics = + StatisticsUtil.deserializeCompletedStatistics( + serializedBytes, completedStatisticsTypeSerializer); + assertThat(completedStatistics).isEqualTo(data[0]); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java new file mode 100644 index 000000000000..a9dd1b5d8173 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.DataDistributionUtil.binarySearchIndex; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.junit.jupiter.api.Test; + +public class TestDataDistributionUtil { + @Test + public void testBinarySearchIndex() { + long[] weightsUDF = {10, 20, 30, 40, 50}; + assertThat(binarySearchIndex(weightsUDF, 0)).isEqualTo(0); + assertThat(binarySearchIndex(weightsUDF, 9)).isEqualTo(0); + assertThat(binarySearchIndex(weightsUDF, 10)).isEqualTo(1); + assertThat(binarySearchIndex(weightsUDF, 15)).isEqualTo(1); + assertThat(binarySearchIndex(weightsUDF, 20)).isEqualTo(2); + assertThat(binarySearchIndex(weightsUDF, 29)).isEqualTo(2); + assertThat(binarySearchIndex(weightsUDF, 30)).isEqualTo(3); + assertThat(binarySearchIndex(weightsUDF, 31)).isEqualTo(3); + assertThat(binarySearchIndex(weightsUDF, 40)).isEqualTo(4); + + // Test with a target that is out of range + assertThatThrownBy(() -> binarySearchIndex(weightsUDF, -1)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("target weight must be non-negative"); + assertThatThrownBy(() -> binarySearchIndex(weightsUDF, 50)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("target weight is out of range"); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java new file mode 100644 index 000000000000..0a6caf2aaa98 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java @@ -0,0 +1,373 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.NUM_SUBTASKS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.time.Duration; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.EventReceivingTasks; +import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; +import org.apache.flink.util.ExceptionUtils; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestDataStatisticsCoordinator { + private static final String OPERATOR_NAME = "TestCoordinator"; + private static final OperatorID TEST_OPERATOR_ID = new OperatorID(1234L, 5678L); + + private EventReceivingTasks receivingTasks; + + @BeforeEach + public void before() throws Exception { + receivingTasks = EventReceivingTasks.createForRunningTasks(); + } + + private void tasksReady(DataStatisticsCoordinator coordinator) { + setAllTasksReady(NUM_SUBTASKS, coordinator, receivingTasks); + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testThrowExceptionWhenNotStarted(StatisticsType type) throws Exception { + try (DataStatisticsCoordinator dataStatisticsCoordinator = createCoordinator(type)) { + String failureMessage = "The coordinator of TestCoordinator has not started yet."; + assertThatThrownBy( + () -> + dataStatisticsCoordinator.handleEventFromOperator( + 0, + 0, + StatisticsEvent.createTaskStatisticsEvent( + 0, new MapDataStatistics(), Fixtures.TASK_STATISTICS_SERIALIZER))) + .isInstanceOf(IllegalStateException.class) + .hasMessage(failureMessage); + assertThatThrownBy(() -> dataStatisticsCoordinator.executionAttemptFailed(0, 0, null)) + .isInstanceOf(IllegalStateException.class) + .hasMessage(failureMessage); + assertThatThrownBy(() -> dataStatisticsCoordinator.checkpointCoordinator(0, null)) + .isInstanceOf(IllegalStateException.class) + .hasMessage(failureMessage); + } + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testDataStatisticsEventHandling(StatisticsType type) throws Exception { + try (DataStatisticsCoordinator dataStatisticsCoordinator = createCoordinator(type)) { + dataStatisticsCoordinator.start(); + tasksReady(dataStatisticsCoordinator); + + StatisticsEvent checkpoint1Subtask0DataStatisticEvent = + Fixtures.createStatisticsEvent( + type, + Fixtures.TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c")); + StatisticsEvent checkpoint1Subtask1DataStatisticEvent = + Fixtures.createStatisticsEvent( + type, + Fixtures.TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c")); + // Handle events from operators for checkpoint 1 + dataStatisticsCoordinator.handleEventFromOperator( + 0, 0, 
checkpoint1Subtask0DataStatisticEvent); + dataStatisticsCoordinator.handleEventFromOperator( + 1, 0, checkpoint1Subtask1DataStatisticEvent); + + waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + Map keyFrequency = + ImmutableMap.of( + CHAR_KEYS.get("a"), 2L, + CHAR_KEYS.get("b"), 3L, + CHAR_KEYS.get("c"), 5L); + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(NUM_SUBTASKS, keyFrequency, 0.0d, SORT_ORDER_COMPARTOR); + + CompletedStatistics completedStatistics = dataStatisticsCoordinator.completedStatistics(); + assertThat(completedStatistics.checkpointId()).isEqualTo(1L); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()).isEqualTo(keyFrequency); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly( + CHAR_KEYS.get("a"), + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c")); + } + + GlobalStatistics globalStatistics = dataStatisticsCoordinator.globalStatistics(); + assertThat(globalStatistics.checkpointId()).isEqualTo(1L); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); + } else { + assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("b")); + } + } + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testDataStatisticsEventHandlingWithNullValue(StatisticsType type) throws Exception { + try (DataStatisticsCoordinator dataStatisticsCoordinator = createCoordinator(type)) { + dataStatisticsCoordinator.start(); + tasksReady(dataStatisticsCoordinator); + + SortKey nullSortKey = Fixtures.SORT_KEY.copy(); + nullSortKey.set(0, null); + + StatisticsEvent checkpoint1Subtask0DataStatisticEvent = + Fixtures.createStatisticsEvent( + type, + Fixtures.TASK_STATISTICS_SERIALIZER, + 1L, + nullSortKey, + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c")); + StatisticsEvent checkpoint1Subtask1DataStatisticEvent = + Fixtures.createStatisticsEvent( + type, + Fixtures.TASK_STATISTICS_SERIALIZER, + 1L, + nullSortKey, + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c")); + // Handle events from operators for checkpoint 1 + dataStatisticsCoordinator.handleEventFromOperator( + 0, 0, checkpoint1Subtask0DataStatisticEvent); + dataStatisticsCoordinator.handleEventFromOperator( + 1, 0, checkpoint1Subtask1DataStatisticEvent); + + waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + Map keyFrequency = + ImmutableMap.of(nullSortKey, 2L, CHAR_KEYS.get("b"), 3L, CHAR_KEYS.get("c"), 5L); + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(NUM_SUBTASKS, keyFrequency, 0.0d, SORT_ORDER_COMPARTOR); + + CompletedStatistics completedStatistics = dataStatisticsCoordinator.completedStatistics(); + assertThat(completedStatistics.checkpointId()).isEqualTo(1L); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()).isEqualTo(keyFrequency); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly( + nullSortKey, + nullSortKey, + 
CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c")); + } + + GlobalStatistics globalStatistics = dataStatisticsCoordinator.globalStatistics(); + assertThat(globalStatistics.checkpointId()).isEqualTo(1L); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); + } else { + assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("b")); + } + } + } + + @Test + public void testRequestGlobalStatisticsEventHandling() throws Exception { + try (DataStatisticsCoordinator dataStatisticsCoordinator = + createCoordinator(StatisticsType.Sketch)) { + dataStatisticsCoordinator.start(); + tasksReady(dataStatisticsCoordinator); + + // receive request before global statistics is ready + dataStatisticsCoordinator.handleEventFromOperator(0, 0, new RequestGlobalStatisticsEvent()); + assertThat(receivingTasks.getSentEventsForSubtask(0)).isEmpty(); + assertThat(receivingTasks.getSentEventsForSubtask(1)).isEmpty(); + + StatisticsEvent checkpoint1Subtask0DataStatisticEvent = + Fixtures.createStatisticsEvent( + StatisticsType.Sketch, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); + StatisticsEvent checkpoint1Subtask1DataStatisticEvent = + Fixtures.createStatisticsEvent( + StatisticsType.Sketch, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); + // Handle events from operators for checkpoint 1 + dataStatisticsCoordinator.handleEventFromOperator( + 0, 0, checkpoint1Subtask0DataStatisticEvent); + dataStatisticsCoordinator.handleEventFromOperator( + 1, 0, checkpoint1Subtask1DataStatisticEvent); + + waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + Awaitility.await("wait for statistics event") + .pollInterval(Duration.ofMillis(10)) + .atMost(Duration.ofSeconds(10)) + .until(() -> receivingTasks.getSentEventsForSubtask(0).size() == 1); + assertThat(receivingTasks.getSentEventsForSubtask(0).get(0)) + .isInstanceOf(StatisticsEvent.class); + + Awaitility.await("wait for statistics event") + .pollInterval(Duration.ofMillis(10)) + .atMost(Duration.ofSeconds(10)) + .until(() -> receivingTasks.getSentEventsForSubtask(1).size() == 1); + assertThat(receivingTasks.getSentEventsForSubtask(1).get(0)) + .isInstanceOf(StatisticsEvent.class); + + dataStatisticsCoordinator.handleEventFromOperator(1, 0, new RequestGlobalStatisticsEvent()); + + // coordinator should send a response to subtask 1 + Awaitility.await("wait for statistics event") + .pollInterval(Duration.ofMillis(10)) + .atMost(Duration.ofSeconds(10)) + .until(() -> receivingTasks.getSentEventsForSubtask(1).size() == 2); + assertThat(receivingTasks.getSentEventsForSubtask(1).get(0)) + .isInstanceOf(StatisticsEvent.class); + assertThat(receivingTasks.getSentEventsForSubtask(1).get(1)) + .isInstanceOf(StatisticsEvent.class); + } + } + + @Test + public void testMultipleRequestGlobalStatisticsEvents() throws Exception { + try (DataStatisticsCoordinator dataStatisticsCoordinator = + createCoordinator(StatisticsType.Map)) { + dataStatisticsCoordinator.start(); + tasksReady(dataStatisticsCoordinator); + + StatisticsEvent checkpoint1Subtask0DataStatisticEvent = + Fixtures.createStatisticsEvent( + StatisticsType.Map, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); + StatisticsEvent checkpoint1Subtask1DataStatisticEvent = + 
Fixtures.createStatisticsEvent( + StatisticsType.Map, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); + + dataStatisticsCoordinator.handleEventFromOperator( + 0, 0, checkpoint1Subtask0DataStatisticEvent); + dataStatisticsCoordinator.handleEventFromOperator( + 1, 0, checkpoint1Subtask1DataStatisticEvent); + + waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + // signature is null + dataStatisticsCoordinator.handleEventFromOperator(0, 0, new RequestGlobalStatisticsEvent()); + + // Checkpoint StatisticEvent + RequestGlobalStatisticsEvent + Awaitility.await("wait for first statistics event") + .pollInterval(Duration.ofMillis(10)) + .atMost(Duration.ofSeconds(10)) + .until(() -> receivingTasks.getSentEventsForSubtask(0).size() == 2); + + // Simulate the scenario where a subtask send global statistics request with the same hash + // code. The coordinator would skip the response after comparing the request contained hash + // code with latest global statistics hash code. + int correctSignature = dataStatisticsCoordinator.globalStatistics().hashCode(); + dataStatisticsCoordinator.handleEventFromOperator( + 0, 0, new RequestGlobalStatisticsEvent(correctSignature)); + + waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + // Checkpoint StatisticEvent + RequestGlobalStatisticsEvent + assertThat(receivingTasks.getSentEventsForSubtask(0).size()).isEqualTo(2); + + // signature is different + dataStatisticsCoordinator.handleEventFromOperator( + 0, 0, new RequestGlobalStatisticsEvent(correctSignature + 1)); + + // Checkpoint StatisticEvent + RequestGlobalStatisticsEvent + RequestGlobalStatisticsEvent + Awaitility.await("wait for second statistics event") + .pollInterval(Duration.ofMillis(10)) + .atMost(Duration.ofSeconds(10)) + .until(() -> receivingTasks.getSentEventsForSubtask(0).size() == 3); + } + } + + static void setAllTasksReady( + int subtasks, + DataStatisticsCoordinator dataStatisticsCoordinator, + EventReceivingTasks receivingTasks) { + for (int i = 0; i < subtasks; i++) { + dataStatisticsCoordinator.executionAttemptReady( + i, 0, receivingTasks.createGatewayForSubtask(i, 0)); + } + } + + static void waitForCoordinatorToProcessActions(DataStatisticsCoordinator coordinator) { + CompletableFuture future = new CompletableFuture<>(); + coordinator.callInCoordinatorThread( + () -> { + future.complete(null); + return null; + }, + "Coordinator fails to process action"); + + try { + future.get(); + } catch (InterruptedException e) { + throw new AssertionError("test interrupted"); + } catch (ExecutionException e) { + ExceptionUtils.rethrow(ExceptionUtils.stripExecutionException(e)); + } + } + + private static DataStatisticsCoordinator createCoordinator(StatisticsType type) { + return new DataStatisticsCoordinator( + OPERATOR_NAME, + new MockOperatorCoordinatorContext(TEST_OPERATOR_ID, NUM_SUBTASKS), + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + NUM_SUBTASKS, + type, + 0.0d); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java new file mode 100644 index 000000000000..6317f2bfde18 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.TASK_STATISTICS_SERIALIZER; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.createStatisticsEvent; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.EventReceivingTasks; +import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; +import org.apache.flink.runtime.operators.coordination.RecreateOnResetOperatorCoordinator; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestDataStatisticsCoordinatorProvider { + private static final OperatorID OPERATOR_ID = new OperatorID(); + + private EventReceivingTasks receivingTasks; + + @BeforeEach + public void before() { + receivingTasks = EventReceivingTasks.createForRunningTasks(); + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testCheckpointAndReset(StatisticsType type) throws Exception { + DataStatisticsCoordinatorProvider provider = createProvider(type, Fixtures.NUM_SUBTASKS); + try (RecreateOnResetOperatorCoordinator coordinator = + (RecreateOnResetOperatorCoordinator) + provider.create( + new MockOperatorCoordinatorContext(OPERATOR_ID, Fixtures.NUM_SUBTASKS))) { + DataStatisticsCoordinator dataStatisticsCoordinator = + (DataStatisticsCoordinator) coordinator.getInternalCoordinator(); + + // Start the coordinator + coordinator.start(); + TestDataStatisticsCoordinator.setAllTasksReady( + Fixtures.NUM_SUBTASKS, dataStatisticsCoordinator, receivingTasks); + + // Handle events from operators for checkpoint 1 + StatisticsEvent checkpoint1Subtask0StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); + coordinator.handleEventFromOperator(0, 0, checkpoint1Subtask0StatisticsEvent); + TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + StatisticsEvent checkpoint1Subtask1StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); + coordinator.handleEventFromOperator(1, 0, checkpoint1Subtask1StatisticsEvent); + TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + // Verify checkpoint 1 global 
data statistics + Map checkpoint1KeyFrequency = + ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L); + MapAssignment checkpoint1MapAssignment = + MapAssignment.fromKeyFrequency( + Fixtures.NUM_SUBTASKS, checkpoint1KeyFrequency, 0.0d, SORT_ORDER_COMPARTOR); + + CompletedStatistics completedStatistics = dataStatisticsCoordinator.completedStatistics(); + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()).isEqualTo(checkpoint1KeyFrequency); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); + } + + GlobalStatistics globalStatistics = dataStatisticsCoordinator.globalStatistics(); + assertThat(globalStatistics).isNotNull(); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint1MapAssignment); + } else { + assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("a")); + } + + byte[] checkpoint1Bytes = waitForCheckpoint(1L, dataStatisticsCoordinator); + + StatisticsEvent checkpoint2Subtask0StatisticsEvent = + createStatisticsEvent( + type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("d"), CHAR_KEYS.get("e")); + coordinator.handleEventFromOperator(0, 0, checkpoint2Subtask0StatisticsEvent); + TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + StatisticsEvent checkpoint2Subtask1StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("f")); + coordinator.handleEventFromOperator(1, 0, checkpoint2Subtask1StatisticsEvent); + TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + // Verify checkpoint 2 global data statistics + Map checkpoint2KeyFrequency = + ImmutableMap.of(CHAR_KEYS.get("d"), 1L, CHAR_KEYS.get("e"), 1L, CHAR_KEYS.get("f"), 1L); + MapAssignment checkpoint2MapAssignment = + MapAssignment.fromKeyFrequency( + Fixtures.NUM_SUBTASKS, checkpoint2KeyFrequency, 0.0d, SORT_ORDER_COMPARTOR); + completedStatistics = dataStatisticsCoordinator.completedStatistics(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()).isEqualTo(checkpoint2KeyFrequency); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly(CHAR_KEYS.get("d"), CHAR_KEYS.get("e"), CHAR_KEYS.get("f")); + } + + globalStatistics = dataStatisticsCoordinator.globalStatistics(); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint2MapAssignment); + } else { + assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("e")); + } + + waitForCheckpoint(2L, dataStatisticsCoordinator); + + // Reset coordinator to checkpoint 1 + coordinator.resetToCheckpoint(1L, checkpoint1Bytes); + DataStatisticsCoordinator restoredDataStatisticsCoordinator = + (DataStatisticsCoordinator) coordinator.getInternalCoordinator(); + assertThat(dataStatisticsCoordinator).isNotSameAs(restoredDataStatisticsCoordinator); + + completedStatistics = 
restoredDataStatisticsCoordinator.completedStatistics(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + // Verify restored data statistics + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); + } + + globalStatistics = restoredDataStatisticsCoordinator.globalStatistics(); + assertThat(globalStatistics).isNotNull(); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint1MapAssignment); + } else { + assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("a")); + } + } + } + + private byte[] waitForCheckpoint(long checkpointId, DataStatisticsCoordinator coordinator) + throws InterruptedException, ExecutionException { + CompletableFuture future = new CompletableFuture<>(); + coordinator.checkpointCoordinator(checkpointId, future); + return future.get(); + } + + private static DataStatisticsCoordinatorProvider createProvider( + StatisticsType type, int downstreamParallelism) { + return new DataStatisticsCoordinatorProvider( + "DataStatisticsCoordinatorProvider", + OPERATOR_ID, + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + downstreamParallelism, + type, + 0.0); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java new file mode 100644 index 000000000000..09b2b6371e8b --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.verify; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.state.OperatorStateStore; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.CloseableRegistry; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.runtime.execution.Environment; +import org.apache.flink.runtime.operators.coordination.MockOperatorEventGateway; +import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; +import org.apache.flink.runtime.state.AbstractStateBackend; +import org.apache.flink.runtime.state.OperatorStateBackendParametersImpl; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateInitializationContextImpl; +import org.apache.flink.runtime.state.TestTaskStateManager; +import org.apache.flink.runtime.state.hashmap.HashMapStateBackend; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.StreamMockEnvironment; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; +import org.mockito.Mockito; + +public class TestDataStatisticsOperator { + + private Environment env; + + @BeforeEach + public void before() throws Exception { + this.env = + new StreamMockEnvironment( + new Configuration(), + new Configuration(), + new ExecutionConfig(), + 1L, + new MockInputSplitProvider(), + 1, + new TestTaskStateManager()); + } + + private DataStatisticsOperator createOperator(StatisticsType type, int downstreamParallelism) + throws Exception { + MockOperatorEventGateway mockGateway = new MockOperatorEventGateway(); + return createOperator(type, downstreamParallelism, mockGateway); + } + + private DataStatisticsOperator createOperator( + StatisticsType type, int downstreamParallelism, MockOperatorEventGateway mockGateway) { + DataStatisticsOperator operator = + new DataStatisticsOperator( + null, + "testOperator", + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + mockGateway, + downstreamParallelism, + type); + return operator; + } + + @SuppressWarnings("unchecked") + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testProcessElement(StatisticsType type) 
throws Exception { + DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); + try (OneInputStreamOperatorTestHarness testHarness = + createHarness(operator)) { + StateInitializationContext stateContext = getStateContext(); + operator.initializeState(stateContext); + operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 5))); + operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 3))); + operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 1))); + + DataStatistics localStatistics = operator.localStatistics(); + assertThat(localStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + Map keyFrequency = (Map) localStatistics.result(); + assertThat(keyFrequency) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 1L)); + } else { + ReservoirItemsSketch sketch = + (ReservoirItemsSketch) localStatistics.result(); + assertThat(sketch.getSamples()) + .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); + } + + testHarness.endInput(); + } + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testProcessElementWithNull(StatisticsType type) throws Exception { + DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); + try (OneInputStreamOperatorTestHarness testHarness = + createHarness(operator)) { + StateInitializationContext stateContext = getStateContext(); + operator.initializeState(stateContext); + operator.processElement(new StreamRecord<>(GenericRowData.of(null, 5))); + operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 3))); + + DataStatistics localStatistics = operator.localStatistics(); + SortKeySerializer sortKeySerializer = + new SortKeySerializer(Fixtures.SCHEMA, Fixtures.SORT_ORDER); + DataStatisticsSerializer taskStatisticsSerializer = + new DataStatisticsSerializer(sortKeySerializer); + DataOutputSerializer outputView = new DataOutputSerializer(1024); + + taskStatisticsSerializer.serialize(localStatistics, outputView); + DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); + DataStatistics dataStatistics = taskStatisticsSerializer.deserialize(inputView); + + testHarness.endInput(); + + assertThat(localStatistics).isEqualTo(dataStatistics); + } + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testOperatorOutput(StatisticsType type) throws Exception { + DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); + try (OneInputStreamOperatorTestHarness testHarness = + createHarness(operator)) { + testHarness.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 2))); + testHarness.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 3))); + testHarness.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 1))); + + List recordsOutput = + testHarness.extractOutputValues().stream() + .filter(StatisticsOrRecord::hasRecord) + .map(StatisticsOrRecord::record) + .collect(Collectors.toList()); + assertThat(recordsOutput) + .containsExactlyInAnyOrderElementsOf( + ImmutableList.of( + GenericRowData.of(StringData.fromString("a"), 2), + GenericRowData.of(StringData.fromString("b"), 3), + GenericRowData.of(StringData.fromString("b"), 1))); + } + } + + private static Stream 
provideRestoreStateParameters() { + return Stream.of( + Arguments.of(StatisticsType.Map, -1), + Arguments.of(StatisticsType.Map, 0), + Arguments.of(StatisticsType.Map, 1), + Arguments.of(StatisticsType.Sketch, -1), + Arguments.of(StatisticsType.Sketch, 0), + Arguments.of(StatisticsType.Sketch, 1)); + } + + @ParameterizedTest + @MethodSource("provideRestoreStateParameters") + public void testRestoreState(StatisticsType type, int parallelismAdjustment) throws Exception { + Map keyFrequency = + ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 1L, CHAR_KEYS.get("c"), 1L); + SortKey[] rangeBounds = new SortKey[] {CHAR_KEYS.get("a")}; + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(2, keyFrequency, 0.0d, SORT_ORDER_COMPARTOR); + DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); + OperatorSubtaskState snapshot; + try (OneInputStreamOperatorTestHarness testHarness1 = + createHarness(operator)) { + GlobalStatistics statistics; + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + statistics = GlobalStatistics.fromMapAssignment(1L, mapAssignment); + } else { + statistics = GlobalStatistics.fromRangeBounds(1L, rangeBounds); + } + + StatisticsEvent event = + StatisticsEvent.createGlobalStatisticsEvent( + statistics, Fixtures.GLOBAL_STATISTICS_SERIALIZER, false); + operator.handleOperatorEvent(event); + + GlobalStatistics globalStatistics = operator.globalStatistics(); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); + assertThat(globalStatistics.rangeBounds()).isNull(); + } else { + assertThat(globalStatistics.mapAssignment()).isNull(); + assertThat(globalStatistics.rangeBounds()).isEqualTo(rangeBounds); + } + + snapshot = testHarness1.snapshot(1L, 0); + } + + // Use the snapshot to initialize state for another new operator and then verify that the global + // statistics for the new operator is same as before + MockOperatorEventGateway spyGateway = Mockito.spy(new MockOperatorEventGateway()); + DataStatisticsOperator restoredOperator = + createOperator(type, Fixtures.NUM_SUBTASKS + parallelismAdjustment, spyGateway); + try (OneInputStreamOperatorTestHarness testHarness2 = + new OneInputStreamOperatorTestHarness<>(restoredOperator, 2, 2, 1)) { + testHarness2.setup(); + testHarness2.initializeState(snapshot); + + GlobalStatistics globalStatistics = restoredOperator.globalStatistics(); + // global statistics is always restored and used initially even if + // downstream parallelism changed. + assertThat(globalStatistics).isNotNull(); + // request is always sent to coordinator during initialization. + // coordinator would respond with a new global statistics that + // has range bound recomputed with new parallelism. 
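+ // Note (test-specific assumption): MockOperatorEventGateway only records the events sent to it
+ // and never replies, so the restored operator is expected to still hold the snapshotted
+ // statistics in the assertions below rather than a recomputed response.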
+ verify(spyGateway).sendEventToCoordinator(any(RequestGlobalStatisticsEvent.class)); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); + assertThat(globalStatistics.rangeBounds()).isNull(); + } else { + assertThat(globalStatistics.mapAssignment()).isNull(); + assertThat(globalStatistics.rangeBounds()).isEqualTo(rangeBounds); + } + } + } + + @SuppressWarnings("unchecked") + @Test + public void testMigrationWithLocalStatsOverThreshold() throws Exception { + DataStatisticsOperator operator = createOperator(StatisticsType.Auto, Fixtures.NUM_SUBTASKS); + try (OneInputStreamOperatorTestHarness testHarness = + createHarness(operator)) { + StateInitializationContext stateContext = getStateContext(); + operator.initializeState(stateContext); + + // add rows with unique keys + for (int i = 0; i < SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD; ++i) { + operator.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString(String.valueOf(i)), i))); + assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Map); + assertThat((Map) operator.localStatistics().result()).hasSize(i + 1); + } + + // one more item should trigger the migration to sketch stats + operator.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString("key-trigger-migration"), 1))); + + int reservoirSize = + SketchUtil.determineOperatorReservoirSize(Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS); + + assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); + ReservoirItemsSketch sketch = + (ReservoirItemsSketch) operator.localStatistics().result(); + assertThat(sketch.getK()).isEqualTo(reservoirSize); + assertThat(sketch.getN()).isEqualTo(SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD + 1); + // reservoir not full yet + assertThat(sketch.getN()).isLessThan(reservoirSize); + assertThat(sketch.getSamples()).hasSize((int) sketch.getN()); + + // add more items to saturate the reservoir + for (int i = 0; i < reservoirSize; ++i) { + operator.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString(String.valueOf(i)), i))); + } + + assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); + sketch = (ReservoirItemsSketch) operator.localStatistics().result(); + assertThat(sketch.getK()).isEqualTo(reservoirSize); + assertThat(sketch.getN()) + .isEqualTo(SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD + 1 + reservoirSize); + // reservoir is full now + assertThat(sketch.getN()).isGreaterThan(reservoirSize); + assertThat(sketch.getSamples()).hasSize(reservoirSize); + + testHarness.endInput(); + } + } + + @SuppressWarnings("unchecked") + @Test + public void testMigrationWithGlobalSketchStatistics() throws Exception { + DataStatisticsOperator operator = createOperator(StatisticsType.Auto, Fixtures.NUM_SUBTASKS); + try (OneInputStreamOperatorTestHarness testHarness = + createHarness(operator)) { + StateInitializationContext stateContext = getStateContext(); + operator.initializeState(stateContext); + + // started with Map stype + operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 1))); + assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Map); + assertThat((Map) operator.localStatistics().result()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); + + // received global statistics with sketch type + GlobalStatistics 
globalStatistics = + GlobalStatistics.fromRangeBounds( + 1L, new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("f")}); + operator.handleOperatorEvent( + StatisticsEvent.createGlobalStatisticsEvent( + globalStatistics, Fixtures.GLOBAL_STATISTICS_SERIALIZER, false)); + + int reservoirSize = + SketchUtil.determineOperatorReservoirSize(Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS); + + assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); + ReservoirItemsSketch sketch = + (ReservoirItemsSketch) operator.localStatistics().result(); + assertThat(sketch.getK()).isEqualTo(reservoirSize); + assertThat(sketch.getN()).isEqualTo(1); + assertThat(sketch.getSamples()).isEqualTo(new SortKey[] {CHAR_KEYS.get("a")}); + + testHarness.endInput(); + } + } + + private StateInitializationContext getStateContext() throws Exception { + AbstractStateBackend abstractStateBackend = new HashMapStateBackend(); + CloseableRegistry cancelStreamRegistry = new CloseableRegistry(); + OperatorStateStore operatorStateStore = + abstractStateBackend.createOperatorStateBackend( + new OperatorStateBackendParametersImpl( + env, "test-operator", Collections.emptyList(), cancelStreamRegistry)); + return new StateInitializationContextImpl(null, operatorStateStore, null, null, null); + } + + private OneInputStreamOperatorTestHarness createHarness( + DataStatisticsOperator dataStatisticsOperator) throws Exception { + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>( + dataStatisticsOperator, Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS, 0); + harness.setup( + new StatisticsOrRecordSerializer( + Fixtures.GLOBAL_STATISTICS_SERIALIZER, Fixtures.ROW_SERIALIZER)); + harness.open(); + return harness; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java new file mode 100644 index 000000000000..59ce6df05d9d --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; + +import org.apache.flink.api.common.typeutils.SerializerTestBase; +import org.apache.flink.api.common.typeutils.TypeSerializer; + +public class TestDataStatisticsSerializer extends SerializerTestBase { + @Override + protected TypeSerializer createSerializer() { + return Fixtures.TASK_STATISTICS_SERIALIZER; + } + + @Override + protected int getLength() { + return -1; + } + + @Override + protected Class getTypeClass() { + return DataStatistics.class; + } + + @Override + protected DataStatistics[] getTestData() { + return new DataStatistics[] { + new MapDataStatistics(), + Fixtures.createTaskStatistics( + StatisticsType.Map, CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")), + new SketchDataStatistics(128), + Fixtures.createTaskStatistics( + StatisticsType.Sketch, CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")) + }; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java new file mode 100644 index 000000000000..7afaf239c668 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; + +import org.apache.flink.api.common.typeutils.SerializerTestBase; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; + +public class TestGlobalStatisticsSerializer extends SerializerTestBase { + + @Override + protected TypeSerializer createSerializer() { + return Fixtures.GLOBAL_STATISTICS_SERIALIZER; + } + + @Override + protected int getLength() { + return -1; + } + + @Override + protected Class getTypeClass() { + return GlobalStatistics.class; + } + + @Override + protected GlobalStatistics[] getTestData() { + return new GlobalStatistics[] { + GlobalStatistics.fromMapAssignment( + 1L, + MapAssignment.fromKeyFrequency( + Fixtures.NUM_SUBTASKS, + ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L), + 0.0d, + SORT_ORDER_COMPARTOR)), + GlobalStatistics.fromRangeBounds(2L, new SortKey[] {CHAR_KEYS.get("a"), CHAR_KEYS.get("b")}) + }; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java new file mode 100644 index 000000000000..8a25c7ad9898 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_WRAPPER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Map; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.Test; + +public class TestMapDataStatistics { + @SuppressWarnings("unchecked") + @Test + public void testAddsAndGet() { + MapDataStatistics dataStatistics = new MapDataStatistics(); + + GenericRowData reusedRow = GenericRowData.of(StringData.fromString("a"), 1); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("b")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("c")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("b")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("a")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("b")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + Map actual = (Map) dataStatistics.result(); + Map expected = + ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 3L, CHAR_KEYS.get("c"), 1L); + assertThat(actual).isEqualTo(expected); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java new file mode 100644 index 000000000000..a59ed3b1c77b --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java @@ -0,0 +1,436 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.util.Pair; +import org.junit.jupiter.api.Test; + +public class TestMapRangePartitioner { + private static final SortOrder SORT_ORDER = + SortOrder.builderFor(TestFixtures.SCHEMA).asc("data").build(); + + private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + private static final SortKey[] SORT_KEYS = initSortKeys(); + + private static SortKey[] initSortKeys() { + SortKey[] sortKeys = new SortKey[10]; + for (int i = 0; i < 10; ++i) { + RowData rowData = + GenericRowData.of(StringData.fromString("k" + i), i, StringData.fromString("2023-06-20")); + RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); + keyWrapper.wrap(rowData); + SortKey sortKey = SORT_KEY.copy(); + sortKey.wrap(keyWrapper); + sortKeys[i] = sortKey; + } + return sortKeys; + } + + // Total weight is 800 + private final Map mapStatistics = + ImmutableMap.of( + SORT_KEYS[0], + 350L, + SORT_KEYS[1], + 230L, + SORT_KEYS[2], + 120L, + SORT_KEYS[3], + 40L, + SORT_KEYS[4], + 10L, + SORT_KEYS[5], + 10L, + SORT_KEYS[6], + 10L, + SORT_KEYS[7], + 10L, + SORT_KEYS[8], + 10L, + SORT_KEYS[9], + 10L); + + @Test + public void testEvenlyDividableNoClosingFileCost() { + int numPartitions = 8; + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); + + // each task should get targeted weight of 100 (=800/8) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(100L, 100L, 100L, 50L), 0L), + SORT_KEYS[1], + new KeyAssignment(ImmutableList.of(3, 4, 5), ImmutableList.of(50L, 100L, 80L), 0L), + SORT_KEYS[2], + new KeyAssignment(ImmutableList.of(5, 6), ImmutableList.of(20L, 100L), 0L), + SORT_KEYS[3], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(40L), 0L), + SORT_KEYS[4], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[5], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[6], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[7], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[8], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[9], + new KeyAssignment(ImmutableList.of(7), 
ImmutableList.of(10L), 0L)); + assertThat(mapAssignment).isEqualTo(new MapAssignment(numPartitions, expectedAssignment)); + + // key: subtask id + // value pair: first is the assigned weight, second is the number of assigned keys + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(100L, 1), + 1, + Pair.of(100L, 1), + 2, + Pair.of(100L, 1), + 3, + Pair.of(100L, 2), + 4, + Pair.of(100L, 1), + 5, + Pair.of(100L, 2), + 6, + Pair.of(100L, 1), + 7, + Pair.of(100L, 7)); + assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); + + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); + Map>> partitionResults = + runPartitioner(partitioner, numPartitions, mapStatistics); + validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); + } + + @Test + public void testEvenlyDividableWithClosingFileCost() { + int numPartitions = 8; + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 5.0, SORT_ORDER_COMPARTOR); + + // target subtask weight is 100 before close file cost factored in. + // close file cost is 5 = 5% * 100. + // key weights before and after close file cost factored in + // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 + // close-cost: 20, 15, 10, 5, 5, 5, 5, 5, 5, 5 + // after: 370, 245, 130, 45, 15, 15, 15, 15, 15, 15 + // target subtask weight with close cost per subtask is 110 (880/8) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(110L, 110L, 110L, 40L), 5L), + SORT_KEYS[1], + new KeyAssignment(ImmutableList.of(3, 4, 5), ImmutableList.of(70L, 110L, 65L), 5L), + SORT_KEYS[2], + new KeyAssignment(ImmutableList.of(5, 6), ImmutableList.of(45L, 85L), 5L), + SORT_KEYS[3], + new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(25L, 20L), 5L), + SORT_KEYS[4], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[5], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[6], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[7], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[8], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[9], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L)); + assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); + + // key: subtask id + // value pair: first is the assigned weight (excluding close file cost) for the subtask, + // second is the number of keys assigned to the subtask + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(105L, 1), + 1, + Pair.of(105L, 1), + 2, + Pair.of(105L, 1), + 3, + Pair.of(100L, 2), + 4, + Pair.of(105L, 1), + 5, + Pair.of(100L, 2), + 6, + Pair.of(100L, 2), + 7, + Pair.of(75L, 7)); + assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); + + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); + Map>> partitionResults = + runPartitioner(partitioner, numPartitions, mapStatistics); + validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); + } + + @Test + public void testNonDividableNoClosingFileCost() { + int numPartitions = 9; + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); + + // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 + // each 
task should get targeted weight of 89 = ceiling(800/9) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(89L, 89L, 89L, 83L), 0L), + SORT_KEYS[1], + new KeyAssignment( + ImmutableList.of(3, 4, 5, 6), ImmutableList.of(6L, 89L, 89L, 46L), 0L), + SORT_KEYS[2], + new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(43L, 77L), 0L), + SORT_KEYS[3], + new KeyAssignment(ImmutableList.of(7, 8), ImmutableList.of(12L, 28L), 0L), + SORT_KEYS[4], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[5], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[6], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[7], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[8], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[9], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L)); + assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); + + // key: subtask id + // value pair: first is the assigned weight, second is the number of assigned keys + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(89L, 1), + 1, + Pair.of(89L, 1), + 2, + Pair.of(89L, 1), + 3, + Pair.of(89L, 2), + 4, + Pair.of(89L, 1), + 5, + Pair.of(89L, 1), + 6, + Pair.of(89L, 2), + 7, + Pair.of(89L, 2), + 8, + Pair.of(88L, 7)); + assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); + + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); + Map>> partitionResults = + runPartitioner(partitioner, numPartitions, mapStatistics); + validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); + } + + @Test + public void testNonDividableWithClosingFileCost() { + int numPartitions = 9; + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 5.0, SORT_ORDER_COMPARTOR); + + // target subtask weight is 89 before close file cost factored in. + // close file cost is 5 (= 5% * 89) per file. 
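+ // (Worked example for the close-cost row below: each key is charged 5 per estimated file,
+ // with files ~= ceil(weight / 89), so 350 -> 4 files -> 20, 230 -> 3 files -> 15,
+ // 120 -> 2 files -> 10, and 5 itself is 5% of the target weight 89 rounded up.)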
+ // key weights before and after close file cost factored in + // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 + // close-cost: 20, 15, 10, 5, 5, 5, 5, 5, 5, 5 + // after: 370, 245, 130, 45, 15, 15, 15, 15, 15, 15 + // target subtask weight per subtask is 98 ceiling(880/9) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(98L, 98L, 98L, 76L), 5L), + SORT_KEYS[1], + new KeyAssignment( + ImmutableList.of(3, 4, 5, 6), ImmutableList.of(22L, 98L, 98L, 27L), 5L), + SORT_KEYS[2], + new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(71L, 59L), 5L), + SORT_KEYS[3], + new KeyAssignment(ImmutableList.of(7, 8), ImmutableList.of(39L, 6L), 5L), + SORT_KEYS[4], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[5], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[6], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[7], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[8], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[9], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L)); + assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); + + // key: subtask id + // value pair: first is the assigned weight for the subtask, second is the number of keys + // assigned to the subtask + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(93L, 1), + 1, + Pair.of(93L, 1), + 2, + Pair.of(93L, 1), + 3, + Pair.of(88L, 2), + 4, + Pair.of(93L, 1), + 5, + Pair.of(93L, 1), + 6, + Pair.of(88L, 2), + 7, + Pair.of(88L, 2), + 8, + Pair.of(61L, 7)); + assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); + + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); + Map>> partitionResults = + runPartitioner(partitioner, numPartitions, mapStatistics); + // drift threshold is high for non-dividable scenario with close cost + validatePartitionResults(expectedAssignmentInfo, partitionResults, 10.0); + } + + private static Map>> runPartitioner( + MapRangePartitioner partitioner, int numPartitions, Map mapStatistics) { + // The Map key is the subtaskId. + // For the map value pair, the first element is the count of assigned and + // the second element of Set is for the set of assigned keys. 
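+ // (Intended shape, with generics: Map<Integer, Pair<AtomicLong, Set<RowData>>>, keyed by subtask id.)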
+ Map>> partitionResults = Maps.newHashMap(); + mapStatistics.forEach( + (sortKey, weight) -> { + String key = sortKey.get(0, String.class); + // run 100x times of the weight + long iterations = weight * 100; + for (int i = 0; i < iterations; ++i) { + RowData rowData = + GenericRowData.of( + StringData.fromString(key), 1, StringData.fromString("2023-06-20")); + int subtaskId = partitioner.partition(rowData, numPartitions); + partitionResults.computeIfAbsent( + subtaskId, k -> Pair.of(new AtomicLong(0), Sets.newHashSet())); + Pair> pair = partitionResults.get(subtaskId); + pair.first().incrementAndGet(); + pair.second().add(rowData); + } + }); + return partitionResults; + } + + /** + * @param expectedAssignmentInfo excluding closing cost + */ + private void validatePartitionResults( + Map> expectedAssignmentInfo, + Map>> partitionResults, + double maxDriftPercentage) { + + assertThat(partitionResults).hasSameSizeAs(expectedAssignmentInfo); + + List expectedAssignedKeyCounts = + Lists.newArrayListWithExpectedSize(expectedAssignmentInfo.size()); + List actualAssignedKeyCounts = + Lists.newArrayListWithExpectedSize(partitionResults.size()); + List expectedNormalizedWeights = + Lists.newArrayListWithExpectedSize(expectedAssignmentInfo.size()); + List actualNormalizedWeights = + Lists.newArrayListWithExpectedSize(partitionResults.size()); + + long expectedTotalWeight = + expectedAssignmentInfo.values().stream().mapToLong(Pair::first).sum(); + expectedAssignmentInfo.forEach( + (subtaskId, pair) -> { + expectedAssignedKeyCounts.add(pair.second()); + expectedNormalizedWeights.add(pair.first().doubleValue() / expectedTotalWeight); + }); + + long actualTotalWeight = + partitionResults.values().stream().mapToLong(pair -> pair.first().longValue()).sum(); + partitionResults.forEach( + (subtaskId, pair) -> { + actualAssignedKeyCounts.add(pair.second().size()); + actualNormalizedWeights.add(pair.first().doubleValue() / actualTotalWeight); + }); + + // number of assigned keys should match exactly + assertThat(actualAssignedKeyCounts) + .as("the number of assigned keys should match for every subtask") + .isEqualTo(expectedAssignedKeyCounts); + + // weight for every subtask shouldn't differ for more than some threshold relative to the + // expected weight + for (int subtaskId = 0; subtaskId < expectedNormalizedWeights.size(); ++subtaskId) { + double expectedWeight = expectedNormalizedWeights.get(subtaskId); + double min = expectedWeight * (1 - maxDriftPercentage / 100); + double max = expectedWeight * (1 + maxDriftPercentage / 100); + assertThat(actualNormalizedWeights.get(subtaskId)) + .as( + "Subtask %d weight should within %.1f percent of the expected range %s", + subtaskId, maxDriftPercentage, expectedWeight) + .isBetween(min, max); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java new file mode 100644 index 000000000000..0485fdb7fa04 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SCHEMA; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Set; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.junit.jupiter.api.Test; + +public class TestRangePartitioner { + private final int numPartitions = 4; + + @Test + public void testRoundRobinRecordsBeforeStatisticsAvailable() { + RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); + Set results = Sets.newHashSetWithExpectedSize(numPartitions); + for (int i = 0; i < numPartitions; ++i) { + results.add( + partitioner.partition( + StatisticsOrRecord.fromRecord(GenericRowData.of(StringData.fromString("a"), 1)), + numPartitions)); + } + + // round-robin. every partition should get an assignment + assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); + } + + @Test + public void testRoundRobinStatisticsWrapper() { + RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); + Set results = Sets.newHashSetWithExpectedSize(numPartitions); + for (int i = 0; i < numPartitions; ++i) { + GlobalStatistics statistics = + GlobalStatistics.fromRangeBounds(1L, new SortKey[] {CHAR_KEYS.get("a")}); + results.add( + partitioner.partition(StatisticsOrRecord.fromStatistics(statistics), numPartitions)); + } + + // round-robin. every partition should get an assignment + assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java new file mode 100644 index 000000000000..d6d8aebc6350 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.DoubleSummaryStatistics; +import java.util.IntSummaryStatistics; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.UUID; +import java.util.concurrent.ThreadLocalRandom; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.SortOrderComparators; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestRangePartitionerSkew { + private static final Logger LOG = LoggerFactory.getLogger(TestRangePartitionerSkew.class); + + // change the iterations to a larger number (like 100) to see the statistics of max skew. + // like min, max, avg, stddev of max skew. + private static final int ITERATIONS = 1; + + /** + * @param parallelism number of partitions + * @param maxSkewUpperBound the upper bound of max skew. maxSkewUpperBound is set to a loose bound + * (~5x of the max value) to avoid flakiness. + *

    + *

  • Map parallelism 8: max skew statistics over 100 iterations: mean = 0.0124, min = + * 0.0046, max = 0.0213 + *
  • Map parallelism 32: max skew statistics over 100 iterations: mean = 0.0183, min = + * 0.0100, max = 0.0261 + */ + @ParameterizedTest + @CsvSource({"8, 100_000, 0.1", "32, 400_000, 0.15"}) + public void testMapStatisticsSkewWithLongTailDistribution( + int parallelism, int sampleSize, double maxSkewUpperBound) { + Schema schema = + new Schema(Types.NestedField.optional(1, "event_hour", Types.IntegerType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("event_hour").build(); + Comparator comparator = SortOrderComparators.forSchema(schema, sortOrder); + SortKey sortKey = new SortKey(schema, sortOrder); + + NavigableMap weights = + DataDistributionUtil.longTailDistribution(100_000, 24, 240, 100, 2.0, 0.7); + Map mapStatistics = + DataDistributionUtil.mapStatisticsWithLongTailDistribution(weights, sortKey); + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(parallelism, mapStatistics, 0.0, comparator); + MapRangePartitioner partitioner = new MapRangePartitioner(schema, sortOrder, mapAssignment); + + List keys = Lists.newArrayList(weights.keySet().iterator()); + long[] weightsCDF = DataDistributionUtil.computeCumulativeWeights(keys, weights); + long totalWeight = weightsCDF[weightsCDF.length - 1]; + + // change the iterations to a larger number (like 100) to see the statistics of max skew. + // like min, max, avg, stddev of max skew. + double[] maxSkews = new double[ITERATIONS]; + for (int iteration = 0; iteration < ITERATIONS; ++iteration) { + int[] recordsPerTask = new int[parallelism]; + for (int i = 0; i < sampleSize; ++i) { + // randomly pick a key according to the weight distribution + long weight = ThreadLocalRandom.current().nextLong(totalWeight); + int index = DataDistributionUtil.binarySearchIndex(weightsCDF, weight); + RowData row = GenericRowData.of(keys.get(index)); + int subtaskId = partitioner.partition(row, parallelism); + recordsPerTask[subtaskId] += 1; + } + + IntSummaryStatistics recordsPerTaskStats = Arrays.stream(recordsPerTask).summaryStatistics(); + LOG.debug("Map parallelism {}: records per task stats: {}", parallelism, recordsPerTaskStats); + double maxSkew = + (recordsPerTaskStats.getMax() - recordsPerTaskStats.getAverage()) + / recordsPerTaskStats.getAverage(); + LOG.debug("Map parallelism {}: max skew: {}", parallelism, format("%.03f", maxSkew)); + assertThat(maxSkew).isLessThan(maxSkewUpperBound); + maxSkews[iteration] = maxSkew; + } + + DoubleSummaryStatistics maxSkewStats = Arrays.stream(maxSkews).summaryStatistics(); + LOG.info( + "Map parallelism {}: max skew statistics over {} iterations: mean = {}, min = {}, max = {}", + parallelism, + ITERATIONS, + format("%.4f", maxSkewStats.getAverage()), + format("%.4f", maxSkewStats.getMin()), + format("%.4f", maxSkewStats.getMax())); + } + + /** + * @param parallelism number of partitions + * @param maxSkewUpperBound the upper bound of max skew. maxSkewUpperBound is set to a loose bound + * (~5x of the max value) to avoid flakiness. + *

    + *

  • Map parallelism 8: max skew statistics over 100 iterations: mean = 0.0192, min = + * 0.0073, max = 0.0437 + *
  • Map parallelism 32: max skew statistics over 100 iterations: mean = 0.0426, min = + * 0.0262, max = 0.0613 + */ + @ParameterizedTest + @CsvSource({"8, 100_000, 0.20", "32, 400_000, 0.25"}) + public void testSketchStatisticsSkewWithLongTailDistribution( + int parallelism, int sampleSize, double maxSkewUpperBound) { + Schema schema = new Schema(Types.NestedField.optional(1, "uuid", Types.UUIDType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("uuid").build(); + SortKey sortKey = new SortKey(schema, sortOrder); + + UUID[] reservoir = DataDistributionUtil.reservoirSampleUUIDs(1_000_000, 100_000); + UUID[] rangeBound = DataDistributionUtil.rangeBoundSampleUUIDs(reservoir, parallelism); + SortKey[] rangeBoundSortKeys = + Arrays.stream(rangeBound) + .map( + uuid -> { + SortKey sortKeyCopy = sortKey.copy(); + sortKeyCopy.set(0, uuid); + return sortKeyCopy; + }) + .toArray(SortKey[]::new); + + SketchRangePartitioner partitioner = + new SketchRangePartitioner(schema, sortOrder, rangeBoundSortKeys); + + double[] maxSkews = new double[ITERATIONS]; + for (int iteration = 0; iteration < ITERATIONS; ++iteration) { + int[] recordsPerTask = new int[parallelism]; + for (int i = 0; i < sampleSize; ++i) { + UUID uuid = UUID.randomUUID(); + Object uuidBytes = DataDistributionUtil.uuidBytes(uuid); + RowData row = GenericRowData.of(uuidBytes); + int subtaskId = partitioner.partition(row, parallelism); + recordsPerTask[subtaskId] += 1; + } + + IntSummaryStatistics recordsPerTaskStats = Arrays.stream(recordsPerTask).summaryStatistics(); + LOG.debug("Map parallelism {}: records per task stats: {}", parallelism, recordsPerTaskStats); + double maxSkew = + (recordsPerTaskStats.getMax() - recordsPerTaskStats.getAverage()) + / recordsPerTaskStats.getAverage(); + LOG.debug("Map parallelism {}: max skew: {}", parallelism, format("%.03f", maxSkew)); + assertThat(maxSkew).isLessThan(maxSkewUpperBound); + maxSkews[iteration] = maxSkew; + } + + DoubleSummaryStatistics maxSkewStats = Arrays.stream(maxSkews).summaryStatistics(); + LOG.info( + "Map parallelism {}: max skew statistics over {} iterations: mean = {}, min = {}, max = {}", + parallelism, + ITERATIONS, + format("%.4f", maxSkewStats.getAverage()), + format("%.4f", maxSkewStats.getMin()), + format("%.4f", maxSkewStats.getMax())); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java new file mode 100644 index 000000000000..396bfae2f13c --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_WRAPPER; +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.SortKey; +import org.junit.jupiter.api.Test; + +public class TestSketchDataStatistics { + @SuppressWarnings("unchecked") + @Test + public void testAddsAndGet() { + SketchDataStatistics dataStatistics = new SketchDataStatistics(128); + + GenericRowData reusedRow = GenericRowData.of(StringData.fromString("a"), 1); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("b")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("c")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("b")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + ReservoirItemsSketch actual = (ReservoirItemsSketch) dataStatistics.result(); + assertThat(actual.getSamples()) + .isEqualTo( + new SortKey[] { + CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("b") + }); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java new file mode 100644 index 000000000000..378c6afff077 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TestFixtures; +import org.junit.jupiter.api.Test; + +public class TestSketchRangePartitioner { + // sort on the long id field + private static final SortOrder SORT_ORDER = + SortOrder.builderFor(TestFixtures.SCHEMA).asc("id").build(); + private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + private static final int NUM_PARTITIONS = 16; + private static final long RANGE_STEP = 1_000; + private static final long MAX_ID = RANGE_STEP * NUM_PARTITIONS; + private static final SortKey[] RANGE_BOUNDS = createRangeBounds(); + + /** + * To understand how range bounds are used in range partitioning, here is an example for human + * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be + * + *
      + *
    • age <= 15 + *
    • age > 15 && age <= 32 + *
    • age > 32 && age <= 60 + *
    • age > 60 + *
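+ *
+ * <p>For instance (an illustrative reading of the bounds above, not an additional test case):
+ * age 10 maps to partition 0 (10 <= 15), age 40 maps to partition 2 (40 > 32 && 40 <= 60),
+ * and age 75 maps to the last partition, 3 (75 > 60).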
    + */ + private static SortKey[] createRangeBounds() { + SortKey[] rangeBounds = new SortKey[NUM_PARTITIONS - 1]; + for (int i = 0; i < NUM_PARTITIONS - 1; ++i) { + RowData rowData = + GenericRowData.of( + StringData.fromString("data"), + RANGE_STEP * (i + 1), + StringData.fromString("2023-06-20")); + RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); + keyWrapper.wrap(rowData); + SortKey sortKey = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + sortKey.wrap(keyWrapper); + rangeBounds[i] = sortKey; + } + + return rangeBounds; + } + + @Test + public void testRangePartitioningWithRangeBounds() { + SketchRangePartitioner partitioner = + new SketchRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, RANGE_BOUNDS); + GenericRowData row = + GenericRowData.of(StringData.fromString("data"), 0L, StringData.fromString("2023-06-20")); + for (long id = 0; id < MAX_ID; ++id) { + row.setField(1, id); + int partition = partitioner.partition(row, NUM_PARTITIONS); + assertThat(partition).isGreaterThanOrEqualTo(0).isLessThan(NUM_PARTITIONS); + int expectedPartition = id == 0L ? 0 : (int) ((id - 1) / RANGE_STEP); + assertThat(partition).isEqualTo(expectedPartition); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java new file mode 100644 index 000000000000..a0f660a965ef --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.SortKey; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class TestSketchUtil { + @Test + public void testCoordinatorReservoirSize() { + // adjusted to over min threshold of 10_000 and is divisible by number of partitions (3) + assertThat(SketchUtil.determineCoordinatorReservoirSize(3)).isEqualTo(10_002); + // adjust to multiplier of 100 + assertThat(SketchUtil.determineCoordinatorReservoirSize(123)).isEqualTo(123_00); + // adjusted to below max threshold of 1_000_000 and is divisible by number of partitions (3) + assertThat(SketchUtil.determineCoordinatorReservoirSize(10_123)) + .isEqualTo(1_000_000 - (1_000_000 % 10_123)); + } + + @Test + public void testOperatorReservoirSize() { + assertThat(SketchUtil.determineOperatorReservoirSize(5, 3)) + .isEqualTo((10_002 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 5); + assertThat(SketchUtil.determineOperatorReservoirSize(123, 123)) + .isEqualTo((123_00 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 123); + assertThat(SketchUtil.determineOperatorReservoirSize(256, 123)) + .isEqualTo( + (int) Math.ceil((double) (123_00 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 256)); + assertThat(SketchUtil.determineOperatorReservoirSize(5_120, 10_123)) + .isEqualTo( + (int) Math.ceil((double) (992_054 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 5_120)); + } + + @Test + public void testRangeBoundsOneChannel() { + assertThat( + SketchUtil.rangeBounds( + 1, + SORT_ORDER_COMPARTOR, + new SortKey[] { + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d"), + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f") + })) + .isEmpty(); + } + + @Test + public void testRangeBoundsDivisible() { + assertThat( + SketchUtil.rangeBounds( + 3, + SORT_ORDER_COMPARTOR, + new SortKey[] { + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d"), + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f") + })) + .containsExactly(CHAR_KEYS.get("b"), CHAR_KEYS.get("d")); + } + + @Test + public void testRangeBoundsNonDivisible() { + // step is 3 = ceiling(11/4) + assertThat( + SketchUtil.rangeBounds( + 4, + SORT_ORDER_COMPARTOR, + new SortKey[] { + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d"), + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f"), + CHAR_KEYS.get("g"), + CHAR_KEYS.get("h"), + CHAR_KEYS.get("i"), + CHAR_KEYS.get("j"), + CHAR_KEYS.get("k"), + })) + .containsExactly(CHAR_KEYS.get("c"), CHAR_KEYS.get("f"), CHAR_KEYS.get("i")); + } + + @Test + public void testRangeBoundsSkipDuplicates() { + // step is 3 = ceiling(11/4) + assertThat( + SketchUtil.rangeBounds( + 4, + SORT_ORDER_COMPARTOR, + new SortKey[] { + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("g"), + CHAR_KEYS.get("h"), + CHAR_KEYS.get("i"), + CHAR_KEYS.get("j"), + CHAR_KEYS.get("k"), + })) + // skipped duplicate c's + .containsExactly(CHAR_KEYS.get("c"), CHAR_KEYS.get("g"), CHAR_KEYS.get("j")); + } + + @Test + public void testRangeBoundsNumPartitionsBiggerThanSortKeyCount() { + assertThat( + SketchUtil.rangeBounds( + 5, + SORT_ORDER_COMPARTOR, + new SortKey[] 
{CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c")})) + .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c")) + .doesNotContainNull(); + } + + @ParameterizedTest + @ValueSource(ints = {4, 6}) + public void testPartitioningAndScaleUp(int numPartitions) { + // Range bounds are calculated based on 4 partitions + SortKey[] rangeBounds = + new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; + + // <= c + assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("c"), numPartitions, rangeBounds); + // > c && <= j + assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); + // > j && <= m + assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); + // > m + assertPartition(3, CHAR_KEYS.get("n"), numPartitions, rangeBounds); + assertPartition(3, CHAR_KEYS.get("z"), numPartitions, rangeBounds); + } + + @Test + public void testPartitionScaleDown() { + // Range bounds are calculated based on 4 partitions + SortKey[] rangeBounds = + new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; + int numPartitions = 3; + + // <= c + assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("c"), numPartitions, rangeBounds); + // > c && <= j + assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); + // > j && <= m + assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); + // > m + // reassigns out-of-range partitions via mod (% 3 in this case) + assertPartition(0, CHAR_KEYS.get("n"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("z"), numPartitions, rangeBounds); + } + + private static void assertPartition( + int expectedPartition, SortKey key, int numPartitions, SortKey[] rangeBounds) { + assertThat(SketchUtil.partition(key, numPartitions, rangeBounds, SORT_ORDER_COMPARTOR)) + .isEqualTo(expectedPartition); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java new file mode 100644 index 000000000000..c7fea015142c --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.api.common.typeutils.SerializerTestBase; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; + +public abstract class TestSortKeySerializerBase extends SerializerTestBase { + + protected abstract Schema schema(); + + protected abstract SortOrder sortOrder(); + + protected abstract GenericRowData rowData(); + + @Override + protected TypeSerializer createSerializer() { + return new SortKeySerializer(schema(), sortOrder()); + } + + @Override + protected int getLength() { + return -1; + } + + @Override + protected Class getTypeClass() { + return SortKey.class; + } + + @Override + protected SortKey[] getTestData() { + return new SortKey[] {sortKey()}; + } + + private SortKey sortKey() { + RowDataWrapper rowDataWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema()), schema().asStruct()); + SortKey sortKey = new SortKey(schema(), sortOrder()); + sortKey.wrap(rowDataWrapper.wrap(rowData())); + return sortKey; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java new file mode 100644 index 000000000000..0000688a8b55 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.iceberg.NullOrder; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortDirection; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.DataGenerator; +import org.apache.iceberg.flink.DataGenerators; + +public class TestSortKeySerializerNestedStruct extends TestSortKeySerializerBase { + private final DataGenerator generator = new DataGenerators.StructOfStruct(); + + @Override + protected Schema schema() { + return generator.icebergSchema(); + } + + @Override + protected SortOrder sortOrder() { + return SortOrder.builderFor(schema()) + .asc("row_id") + .sortBy( + Expressions.bucket("struct_of_struct.id", 4), SortDirection.DESC, NullOrder.NULLS_LAST) + .sortBy( + Expressions.truncate("struct_of_struct.person_struct.name", 16), + SortDirection.ASC, + NullOrder.NULLS_FIRST) + .build(); + } + + @Override + protected GenericRowData rowData() { + return generator.generateFlinkRowData(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java new file mode 100644 index 000000000000..ac2e2784e681 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.NullOrder; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortDirection; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.DataGenerator; +import org.apache.iceberg.flink.DataGenerators; +import org.apache.iceberg.flink.RowDataWrapper; +import org.junit.jupiter.api.Test; + +public class TestSortKeySerializerPrimitives extends TestSortKeySerializerBase { + private final DataGenerator generator = new DataGenerators.Primitives(); + + @Override + protected Schema schema() { + return generator.icebergSchema(); + } + + @Override + protected SortOrder sortOrder() { + return SortOrder.builderFor(schema()) + .asc("boolean_field") + .sortBy(Expressions.bucket("int_field", 4), SortDirection.DESC, NullOrder.NULLS_LAST) + .sortBy(Expressions.truncate("string_field", 2), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy(Expressions.bucket("uuid_field", 16), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy(Expressions.hour("ts_with_zone_field"), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy(Expressions.day("ts_without_zone_field"), SortDirection.ASC, NullOrder.NULLS_FIRST) + // can not test HeapByteBuffer due to equality test inside SerializerTestBase + // .sortBy(Expressions.truncate("binary_field", 2), SortDirection.ASC, + // NullOrder.NULLS_FIRST) + .build(); + } + + @Override + protected GenericRowData rowData() { + return generator.generateFlinkRowData(); + } + + @Test + public void testSerializationSize() throws Exception { + RowData rowData = + GenericRowData.of(StringData.fromString("550e8400-e29b-41d4-a716-446655440000"), 1L); + RowDataWrapper rowDataWrapper = + new RowDataWrapper(Fixtures.ROW_TYPE, Fixtures.SCHEMA.asStruct()); + StructLike struct = rowDataWrapper.wrap(rowData); + SortKey sortKey = Fixtures.SORT_KEY.copy(); + sortKey.wrap(struct); + SortKeySerializer serializer = new SortKeySerializer(Fixtures.SCHEMA, Fixtures.SORT_ORDER); + DataOutputSerializer output = new DataOutputSerializer(1024); + serializer.serialize(sortKey, output); + byte[] serializedBytes = output.getCopyOfBuffer(); + assertThat(serializedBytes.length) + .as( + "Serialized bytes for sort key should be 39 bytes (34 UUID text + 4 byte integer of string length + 1 byte of isnull flag") + .isEqualTo(39); + + DataInputDeserializer input = new DataInputDeserializer(serializedBytes); + SortKey deserialized = serializer.deserialize(input); + assertThat(deserialized).isEqualTo(sortKey); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java new file mode 100644 index 000000000000..2d87b089cecb --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_TYPE; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SCHEMA; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_KEY; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import java.io.IOException; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestSortKeySerializerSnapshot { + private final Schema schema = + new Schema( + Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(2, "str", Types.StringType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get()), + Types.NestedField.optional(4, "boolean", Types.BooleanType.get())); + private final SortOrder sortOrder = SortOrder.builderFor(schema).asc("str").asc("int").build(); + + @Test + public void testRestoredSerializer() throws Exception { + RowData rowData = GenericRowData.of(StringData.fromString("str"), 1); + RowDataWrapper rowDataWrapper = new RowDataWrapper(ROW_TYPE, SCHEMA.asStruct()); + StructLike struct = rowDataWrapper.wrap(rowData); + SortKey sortKey = SORT_KEY.copy(); + sortKey.wrap(struct); + + SortKeySerializer originalSerializer = new SortKeySerializer(SCHEMA, SORT_ORDER); + TypeSerializerSnapshot snapshot = + roundTrip(originalSerializer.snapshotConfiguration()); + TypeSerializer restoredSerializer = snapshot.restoreSerializer(); + + DataOutputSerializer output = new DataOutputSerializer(1024); + originalSerializer.serialize(sortKey, output); + byte[] serializedBytes = output.getCopyOfBuffer(); + + DataInputDeserializer input = new DataInputDeserializer(serializedBytes); + SortKey deserialized = restoredSerializer.deserialize(input); + assertThat(deserialized).isEqualTo(sortKey); + } + + @Test + public void testRestoredOldSerializer() throws Exception { + RowData rowData = GenericRowData.of(StringData.fromString("str"), 1); + RowDataWrapper rowDataWrapper = new 
RowDataWrapper(ROW_TYPE, SCHEMA.asStruct()); + StructLike struct = rowDataWrapper.wrap(rowData); + SortKey sortKey = SORT_KEY.copy(); + sortKey.wrap(struct); + + SortKeySerializer originalSerializer = new SortKeySerializer(SCHEMA, SORT_ORDER, 1); + TypeSerializerSnapshot snapshot = + roundTrip(originalSerializer.snapshotConfiguration()); + TypeSerializer restoredSerializer = snapshot.restoreSerializer(); + ((SortKeySerializer) restoredSerializer).setVersion(1); + DataOutputSerializer output = new DataOutputSerializer(1024); + originalSerializer.serialize(sortKey, output); + byte[] serializedBytes = output.getCopyOfBuffer(); + + DataInputDeserializer input = new DataInputDeserializer(serializedBytes); + SortKey deserialized = restoredSerializer.deserialize(input); + assertThat(deserialized).isEqualTo(sortKey); + } + + @Test + public void testSnapshotIsCompatibleWithSameSortOrder() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); + } + + @Test + public void testSnapshotIsCompatibleWithRemoveNonSortField() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + // removed non-sort boolean field + Schema newSchema = + new Schema( + Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(2, "str", Types.StringType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get())); + SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(newSchema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); + } + + @Test + public void testSnapshotIsCompatibleWithAddNonSortField() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + // add a new non-sort float field + Schema newSchema = + new Schema( + Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(2, "str", Types.StringType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get()), + Types.NestedField.optional(4, "boolean", Types.BooleanType.get()), + Types.NestedField.required(5, "float", Types.FloatType.get())); + SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(newSchema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); + } + + @Test + public void testSnapshotIsIncompatibleWithIncompatibleSchema() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, 
sortOrder); + + // change str field to a long type + Schema newSchema = + new Schema( + Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(2, "str", Types.LongType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get()), + Types.NestedField.optional(4, "boolean", Types.BooleanType.get())); + SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); + // switch sort field order + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(newSchema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isIncompatible()).isTrue(); + } + + @Test + public void testSnapshotIsIncompatibleWithAddSortField() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + // removed str field from sort order + SortOrder newSortOrder = + SortOrder.builderFor(schema).asc("str").asc("int").desc("boolean").build(); + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isIncompatible()).isTrue(); + } + + @Test + public void testSnapshotIsIncompatibleWithRemoveSortField() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + // remove str field from sort order + SortOrder newSortOrder = SortOrder.builderFor(schema).asc("int").build(); + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isIncompatible()).isTrue(); + } + + @Test + public void testSnapshotIsIncompatibleWithSortFieldsOrderChange() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + // switch sort field order + SortOrder newSortOrder = SortOrder.builderFor(schema).asc("int").asc("str").build(); + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isIncompatible()).isTrue(); + } + + /** Copied from Flink {@code AvroSerializerSnapshotTest} */ + private static SortKeySerializer.SortKeySerializerSnapshot roundTrip( + TypeSerializerSnapshot original) throws IOException { + // writeSnapshot(); + DataOutputSerializer out = new DataOutputSerializer(1024); + original.writeSnapshot(out); + // init + SortKeySerializer.SortKeySerializerSnapshot restored = + new SortKeySerializer.SortKeySerializerSnapshot(); + // readSnapshot(); + DataInputView in = new DataInputDeserializer(out.wrapAsByteBuffer()); + restored.readSnapshot(restored.getCurrentVersion(), in, original.getClass().getClassLoader()); + return restored; + } +} diff --git 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java new file mode 100644 index 000000000000..1be7e27f2c01 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.NullOrder; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortDirection; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestSortKeyUtil { + @Test + public void testResultSchema() { + Schema schema = + new Schema( + Types.NestedField.required(1, "id", Types.StringType.get()), + Types.NestedField.required(2, "ratio", Types.DoubleType.get()), + Types.NestedField.optional( + 3, + "user", + Types.StructType.of( + Types.NestedField.required(11, "name", Types.StringType.get()), + Types.NestedField.required(12, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(13, "device_id", Types.UUIDType.get()), + Types.NestedField.optional( + 14, + "location", + Types.StructType.of( + Types.NestedField.required(101, "lat", Types.FloatType.get()), + Types.NestedField.required(102, "long", Types.FloatType.get()), + Types.NestedField.required(103, "blob", Types.BinaryType.get())))))); + + SortOrder sortOrder = + SortOrder.builderFor(schema) + .asc("ratio") + .sortBy(Expressions.hour("user.ts"), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy( + Expressions.bucket("user.device_id", 16), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy( + Expressions.truncate("user.location.blob", 16), + SortDirection.ASC, + NullOrder.NULLS_FIRST) + .build(); + + assertThat(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()) + .isEqualTo( + Types.StructType.of( + Types.NestedField.required(0, "ratio_0", Types.DoubleType.get()), + Types.NestedField.required(1, "ts_1", Types.IntegerType.get()), + Types.NestedField.optional(2, "device_id_2", Types.IntegerType.get()), + Types.NestedField.required(3, "blob_3", Types.BinaryType.get()))); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java new file mode 100644 index 000000000000..f54198522e99 --- /dev/null +++ 
b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.api.common.typeutils.TypeInformationTestBase; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.types.Types; + +public class TestStatisticsOrRecordTypeInformation + extends TypeInformationTestBase { + private static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(2, "uuid", Types.UUIDType.get()), + Types.NestedField.optional(3, "data", Types.StringType.get())); + private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); + private static final SortOrder SORT_ORDER1 = SortOrder.builderFor(SCHEMA).asc("ts").build(); + private static final SortOrder SORT_ORDER2 = SortOrder.builderFor(SCHEMA).asc("data").build(); + + @Override + protected StatisticsOrRecordTypeInformation[] getTestData() { + return new StatisticsOrRecordTypeInformation[] { + new StatisticsOrRecordTypeInformation(ROW_TYPE, SCHEMA, SORT_ORDER1), + new StatisticsOrRecordTypeInformation(ROW_TYPE, SCHEMA, SORT_ORDER2), + }; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java new file mode 100644 index 000000000000..134858f5055e --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.legacy.SourceFunction; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.ProviderContext; +import org.apache.flink.table.connector.source.DataStreamScanProvider; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.table.factories.DynamicTableSourceFactory; +import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +public class BoundedTableFactory implements DynamicTableSourceFactory { + private static final AtomicInteger DATA_SET_ID = new AtomicInteger(0); + private static final Map>> DATA_SETS = Maps.newHashMap(); + + private static final ConfigOption DATA_ID = + ConfigOptions.key("data-id").stringType().noDefaultValue(); + + public static String registerDataSet(List> dataSet) { + String dataSetId = String.valueOf(DATA_SET_ID.incrementAndGet()); + DATA_SETS.put(dataSetId, dataSet); + return dataSetId; + } + + public static void clearDataSets() { + DATA_SETS.clear(); + } + + @Override + public DynamicTableSource createDynamicTableSource(Context context) { + ResolvedSchema resolvedSchema = + ResolvedSchema.of( + context.getCatalogTable().getResolvedSchema().getColumns().stream() + .filter(Column::isPhysical) + .collect(Collectors.toList())); + + Configuration configuration = Configuration.fromMap(context.getCatalogTable().getOptions()); + String dataId = configuration.get(DATA_ID); + Preconditions.checkArgument( + DATA_SETS.containsKey(dataId), "data-id %s does not found in registered data set.", dataId); + + return new BoundedTableSource(DATA_SETS.get(dataId), resolvedSchema); + } + + @Override + public String factoryIdentifier() { + return "BoundedSource"; + } + + @Override + public Set> requiredOptions() { + return ImmutableSet.of(); + } + + @Override + public Set> optionalOptions() { + return ImmutableSet.of(DATA_ID); + } + + private static class BoundedTableSource implements ScanTableSource { + + private final List> elementsPerCheckpoint; + private final ResolvedSchema resolvedSchema; + + private BoundedTableSource( + List> 
elementsPerCheckpoint, ResolvedSchema resolvedSchema) { + this.elementsPerCheckpoint = elementsPerCheckpoint; + this.resolvedSchema = resolvedSchema; + } + + private BoundedTableSource(BoundedTableSource toCopy) { + this.elementsPerCheckpoint = toCopy.elementsPerCheckpoint; + this.resolvedSchema = toCopy.resolvedSchema; + } + + @Override + public ChangelogMode getChangelogMode() { + Supplier> supplier = () -> elementsPerCheckpoint.stream().flatMap(List::stream); + + // Add the INSERT row kind by default. + ChangelogMode.Builder builder = ChangelogMode.newBuilder().addContainedKind(RowKind.INSERT); + + if (supplier.get().anyMatch(r -> r.getKind() == RowKind.DELETE)) { + builder.addContainedKind(RowKind.DELETE); + } + + if (supplier.get().anyMatch(r -> r.getKind() == RowKind.UPDATE_BEFORE)) { + builder.addContainedKind(RowKind.UPDATE_BEFORE); + } + + if (supplier.get().anyMatch(r -> r.getKind() == RowKind.UPDATE_AFTER)) { + builder.addContainedKind(RowKind.UPDATE_AFTER); + } + + return builder.build(); + } + + @Override + public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { + return new DataStreamScanProvider() { + @Override + public DataStream produceDataStream( + ProviderContext providerContext, StreamExecutionEnvironment env) { + boolean checkpointEnabled = env.getCheckpointConfig().isCheckpointingEnabled(); + SourceFunction source = + new BoundedTestSource<>(elementsPerCheckpoint, checkpointEnabled); + + RowType rowType = (RowType) resolvedSchema.toSourceRowDataType().getLogicalType(); + // Converter to convert the Row to RowData. + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter( + resolvedSchema.getColumnDataTypes().toArray(DataType[]::new)); + + return env.addSource( + source, + new RowTypeInfo( + resolvedSchema.getColumnDataTypes().stream() + .map(ExternalTypeInfo::of) + .toArray(TypeInformation[]::new))) + .map(rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)); + } + + @Override + public boolean isBounded() { + return true; + } + }; + } + + @Override + public DynamicTableSource copy() { + return new BoundedTableSource(this); + } + + @Override + public String asSummaryString() { + return "Bounded test table source"; + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java new file mode 100644 index 000000000000..e412006176b9 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.flink.api.common.state.CheckpointListener; +import org.apache.flink.streaming.api.functions.source.legacy.SourceFunction; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * A stream source that: 1) emits the elements from elementsPerCheckpoint.get(0) without allowing + * checkpoints. 2) then waits for the checkpoint to complete. 3) emits the elements from + * elementsPerCheckpoint.get(1) without allowing checkpoints. 4) then waits for the checkpoint to + * complete. 5) ... + * + *

    Util all the list from elementsPerCheckpoint are exhausted. + */ +public final class BoundedTestSource implements SourceFunction, CheckpointListener { + + private final List> elementsPerCheckpoint; + private final boolean checkpointEnabled; + private volatile boolean running = true; + + private final AtomicInteger numCheckpointsComplete = new AtomicInteger(0); + + /** Emits all those elements in several checkpoints. */ + public BoundedTestSource(List> elementsPerCheckpoint, boolean checkpointEnabled) { + this.elementsPerCheckpoint = elementsPerCheckpoint; + this.checkpointEnabled = checkpointEnabled; + } + + public BoundedTestSource(List> elementsPerCheckpoint) { + this(elementsPerCheckpoint, true); + } + + /** Emits all those elements in a single checkpoint. */ + public BoundedTestSource(T... elements) { + this(Collections.singletonList(Arrays.asList(elements))); + } + + @Override + public void run(SourceContext ctx) throws Exception { + if (!checkpointEnabled) { + Preconditions.checkArgument( + elementsPerCheckpoint.size() <= 1, + "There should be at most one list in the elementsPerCheckpoint when checkpoint is disabled."); + elementsPerCheckpoint.stream().flatMap(List::stream).forEach(ctx::collect); + return; + } + + for (List elements : elementsPerCheckpoint) { + + final int checkpointToAwait; + synchronized (ctx.getCheckpointLock()) { + // Let's say checkpointToAwait = numCheckpointsComplete.get() + delta, in fact the value of + // delta should not + // affect the final table records because we only need to make sure that there will be + // exactly + // elementsPerCheckpoint.size() checkpoints to emit each records buffer from the original + // elementsPerCheckpoint. + // Even if the checkpoints that emitted results are not continuous, the correctness of the + // data should not be + // affected in the end. Setting the delta to be 2 is introducing the variable that produce + // un-continuous + // checkpoints that emit the records buffer from elementsPerCheckpoints. + checkpointToAwait = numCheckpointsComplete.get() + 2; + for (T element : elements) { + ctx.collect(element); + } + } + + synchronized (ctx.getCheckpointLock()) { + while (running && numCheckpointsComplete.get() < checkpointToAwait) { + ctx.getCheckpointLock().wait(1); + } + } + } + } + + @Override + public void notifyCheckpointComplete(long checkpointId) throws Exception { + numCheckpointsComplete.incrementAndGet(); + } + + @Override + public void cancel() { + running = false; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java new file mode 100644 index 000000000000..5dfbbe3abe73 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.TestBase; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestInfo; + +public class ChangeLogTableTestBase extends TestBase { + private volatile TableEnvironment tEnv = null; + + protected String tableName; + + @BeforeEach + public void setup(TestInfo testInfo) { + assertThat(testInfo.getTestMethod()).isPresent(); + this.tableName = testInfo.getTestMethod().get().getName(); + } + + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s", tableName); + BoundedTableFactory.clearDataSets(); + } + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + EnvironmentSettings settings = + EnvironmentSettings.newInstance().inStreamingMode().build(); + + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(400) + .setMaxParallelism(1) + .setParallelism(1); + + tEnv = StreamTableEnvironment.create(env, settings); + } + } + } + return tEnv; + } + + protected static Row insertRow(Object... values) { + return Row.ofKind(RowKind.INSERT, values); + } + + protected static Row deleteRow(Object... values) { + return Row.ofKind(RowKind.DELETE, values); + } + + protected static Row updateBeforeRow(Object... values) { + return Row.ofKind(RowKind.UPDATE_BEFORE, values); + } + + protected static Row updateAfterRow(Object... values) { + return Row.ofKind(RowKind.UPDATE_AFTER, values); + } + + protected static List listJoin(List> lists) { + return lists.stream().flatMap(List::stream).collect(Collectors.toList()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java new file mode 100644 index 000000000000..540902f3cea5 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.spy; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.BaseCombinedScanTask; +import org.apache.iceberg.BaseFileScanTask; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileMetadata; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.PartitionSpecParser; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.expressions.ResidualEvaluator; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.ThreadPools; + +public class SplitHelpers { + + private SplitHelpers() {} + + /** + * This create a list of IcebergSourceSplit from real files + *
  • Create a new Hadoop table under the {@code temporaryFolder}
  • Write {@code fileCount} files to the new Iceberg table
  • Discover the splits from the table and partition the splits by the {@code filesPerSplit} limit
  • Delete the Hadoop table
    Since the table and data files are deleted before this method returns, callers shouldn't attempt to read the data files.
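    An illustrative usage sketch (editorial addition, not part of the original sources; {@code tempDir} is assumed to be a JUnit {@code @TempDir} {@code java.nio.file.Path} supplied by the calling test):

    {@code
    // writes 4 data files into a throwaway Hadoop table, then plans splits of at most 2 files each
    List<IcebergSourceSplit> splits =
        SplitHelpers.createSplitsFromTransientHadoopTable(tempDir, 4, 2);
    }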
    By default, v1 Iceberg table is created. For v2 table use {@link + * SplitHelpers#createSplitsFromTransientHadoopTable(Path, int, int, String)} + * + * @param temporaryFolder Folder to place the data to + * @param fileCount The number of files to create and add to the table + * @param filesPerSplit The number of files used for a split + */ + public static List createSplitsFromTransientHadoopTable( + Path temporaryFolder, int fileCount, int filesPerSplit) throws Exception { + return createSplitsFromTransientHadoopTable(temporaryFolder, fileCount, filesPerSplit, "1"); + } + + /** + * This create a list of IcebergSourceSplit from real files + *
  • Create a new Hadoop table under the {@code temporaryFolder}
  • Write {@code fileCount} files to the new Iceberg table
  • Discover the splits from the table and partition the splits by the {@code filesPerSplit} limit
  • Delete the Hadoop table
    Since the table and data files are deleted before this method return, caller shouldn't + * attempt to read the data files. + * + * @param temporaryFolder Folder to place the data to + * @param fileCount The number of files to create and add to the table + * @param filesPerSplit The number of files used for a split + * @param version The table version to create + */ + public static List createSplitsFromTransientHadoopTable( + Path temporaryFolder, int fileCount, int filesPerSplit, String version) throws Exception { + final File warehouseFile = File.createTempFile("junit", null, temporaryFolder.toFile()); + assertThat(warehouseFile.delete()).isTrue(); + final String warehouse = "file:" + warehouseFile; + Configuration hadoopConf = new Configuration(); + final HadoopCatalog catalog = new HadoopCatalog(hadoopConf, warehouse); + ImmutableMap properties = + ImmutableMap.of(TableProperties.FORMAT_VERSION, version); + try { + final Table table = + catalog.createTable( + TestFixtures.TABLE_IDENTIFIER, + TestFixtures.SCHEMA, + PartitionSpec.unpartitioned(), + null, + properties); + final GenericAppenderHelper dataAppender = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); + for (int i = 0; i < fileCount; ++i) { + List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); + dataAppender.appendToTable(records); + } + + final ScanContext scanContext = ScanContext.builder().build(); + final List splits = + FlinkSplitPlanner.planIcebergSourceSplits( + table, scanContext, ThreadPools.getWorkerPool()); + return splits.stream() + .flatMap( + split -> { + List> filesList = + Lists.partition(Lists.newArrayList(split.task().files()), filesPerSplit); + return filesList.stream() + .map(files -> new BaseCombinedScanTask(files)) + .map( + combinedScanTask -> + IcebergSourceSplit.fromCombinedScanTask(combinedScanTask)); + }) + .collect(Collectors.toList()); + } finally { + catalog.dropTable(TestFixtures.TABLE_IDENTIFIER); + catalog.close(); + } + } + + /** + * This method will equip the {@code icebergSourceSplits} with mock delete files. + *
  • For each split, create {@code deleteFilesPerSplit} mock delete files
  • Replace the original {@code FileScanTask}s with new {@code FileScanTask}s that reference the mock delete files
  • Caller should not attempt to read the deleted files since they are created as mock, and + * they are not real files + * + * @param icebergSourceSplits The real splits to equip with mock delete files + * @param temporaryFolder The temporary folder to create the mock delete files with + * @param deleteFilesPerSplit The number of delete files to create for each split + * @return The list of re-created splits with mock delete files + * @throws IOException If there is any error creating the mock delete files + */ + public static List equipSplitsWithMockDeleteFiles( + List icebergSourceSplits, Path temporaryFolder, int deleteFilesPerSplit) + throws IOException { + List icebergSourceSplitsWithMockDeleteFiles = Lists.newArrayList(); + for (IcebergSourceSplit split : icebergSourceSplits) { + final CombinedScanTask combinedScanTask = spy(split.task()); + + final List deleteFiles = Lists.newArrayList(); + final PartitionSpec spec = + PartitionSpec.builderFor(TestFixtures.SCHEMA).withSpecId(0).build(); + + for (int i = 0; i < deleteFilesPerSplit; ++i) { + final DeleteFile deleteFile = + FileMetadata.deleteFileBuilder(spec) + .withFormat(FileFormat.PARQUET) + .withPath(File.createTempFile("junit", null, temporaryFolder.toFile()).getPath()) + .ofPositionDeletes() + .withFileSizeInBytes(1000) + .withRecordCount(1000) + .build(); + deleteFiles.add(deleteFile); + } + + List newFileScanTasks = Lists.newArrayList(); + for (FileScanTask task : combinedScanTask.tasks()) { + String schemaString = SchemaParser.toJson(task.schema()); + String specString = PartitionSpecParser.toJson(task.spec()); + + BaseFileScanTask baseFileScanTask = + new BaseFileScanTask( + task.file(), + deleteFiles.toArray(new DeleteFile[] {}), + schemaString, + specString, + ResidualEvaluator.unpartitioned(task.residual())); + newFileScanTasks.add(baseFileScanTask); + } + doReturn(newFileScanTasks).when(combinedScanTask).tasks(); + icebergSourceSplitsWithMockDeleteFiles.add( + IcebergSourceSplit.fromCombinedScanTask( + combinedScanTask, split.fileOffset(), split.recordOffset())); + } + return icebergSourceSplitsWithMockDeleteFiles; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java new file mode 100644 index 000000000000..e4e48ca67f66 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.List; +import java.util.Map; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +public class SqlHelpers { + private SqlHelpers() {} + + public static List sql(TableEnvironment tableEnv, String query, Object... args) { + TableResult tableResult = tableEnv.executeSql(String.format(query, args)); + try (CloseableIterator iter = tableResult.collect()) { + List results = Lists.newArrayList(iter); + return results; + } catch (Exception e) { + throw new RuntimeException("Failed to collect table result", e); + } + } + + public static String sqlOptionsToString(Map sqlOptions) { + StringBuilder builder = new StringBuilder(); + sqlOptions.forEach((key, value) -> builder.append(optionToKv(key, value)).append(",")); + String optionStr = builder.toString(); + if (optionStr.endsWith(",")) { + optionStr = optionStr.substring(0, optionStr.length() - 1); + } + + if (!optionStr.isEmpty()) { + optionStr = String.format("/*+ OPTIONS(%s)*/", optionStr); + } + + return optionStr; + } + + private static String optionToKv(String key, Object value) { + return "'" + key + "'='" + value + "'"; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java new file mode 100644 index 000000000000..c83a9a1baa15 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.events.Listeners; +import org.apache.iceberg.events.ScanEvent; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.TestBase; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public abstract class TableSourceTestBase extends TestBase { + @Parameters(name = "useFlip27Source = {0}") + protected static Object[][] parameters() { + return new Object[][] { + {false}, {true}, + }; + } + + @Parameter(index = 0) + protected boolean useFlip27Source; + + protected static final String CATALOG_NAME = "test_catalog"; + protected static final String DATABASE_NAME = "test_db"; + protected static final String TABLE_NAME = "test_table"; + protected final FileFormat format = FileFormat.AVRO; + protected int scanEventCount = 0; + protected ScanEvent lastScanEvent = null; + + @Override + protected TableEnvironment getTableEnv() { + super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); + super.getTableEnv() + .getConfig() + .getConfiguration() + .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, useFlip27Source); + return super.getTableEnv(); + } + + @BeforeEach + public void before() throws IOException { + // register a scan event listener to validate pushdown + Listeners.register( + event -> { + scanEventCount += 1; + lastScanEvent = event; + }, + ScanEvent.class); + + File warehouseFile = File.createTempFile("junit", null, temporaryDirectory.toFile()); + assertThat(warehouseFile.delete()).isTrue(); + String warehouse = String.format("file:%s", warehouseFile); + + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_NAME, warehouse); + sql("USE CATALOG %s", CATALOG_NAME); + sql("CREATE DATABASE %s", DATABASE_NAME); + sql("USE %s", DATABASE_NAME); + sql( + "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", + TABLE_NAME, format.name()); + sql( + "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", + TABLE_NAME); + + this.scanEventCount = 0; + this.lastScanEvent = null; + } + + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, TABLE_NAME); + dropDatabase(DATABASE_NAME, true); + dropCatalog(CATALOG_NAME, true); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java new file mode 100644 index 000000000000..bde751e1f87f --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import org.apache.flink.types.Row; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Streams; +import org.junit.jupiter.api.Test; + +public class TestBoundedTableFactory extends ChangeLogTableTestBase { + + @Test + public void testEmptyDataSet() { + List> emptyDataSet = ImmutableList.of(); + + String dataId = BoundedTableFactory.registerDataSet(emptyDataSet); + sql( + "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", + tableName, dataId); + + assertThat(sql("SELECT * FROM %s", tableName)).isEmpty(); + } + + @Test + public void testBoundedTableFactory() { + List> dataSet = + ImmutableList.of( + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(1, "bbb"), + insertRow(2, "aaa"), + deleteRow(2, "aaa"), + insertRow(2, "bbb")), + ImmutableList.of( + updateBeforeRow(2, "bbb"), + updateAfterRow(2, "ccc"), + deleteRow(2, "ccc"), + insertRow(2, "ddd")), + ImmutableList.of( + deleteRow(1, "bbb"), + insertRow(1, "ccc"), + deleteRow(1, "ccc"), + insertRow(1, "ddd"))); + + String dataId = BoundedTableFactory.registerDataSet(dataSet); + sql( + "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", + tableName, dataId); + + List rowSet = dataSet.stream().flatMap(Streams::stream).collect(Collectors.toList()); + assertThat(sql("SELECT * FROM %s", tableName)).isEqualTo(rowSet); + + assertThat(sql("SELECT * FROM %s WHERE data='aaa'", tableName)) + .isEqualTo( + rowSet.stream() + .filter(r -> Objects.equals(r.getField(1), "aaa")) + .collect(Collectors.toList())); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java new file mode 100644 index 000000000000..d11bb8640412 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.flink.SimpleDataUtil.SCHEMA; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.TestTemplate; + +/** Test {@link FlinkInputFormat}. */ +public class TestFlinkInputFormat extends TestFlinkSource { + + @Override + protected List run( + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... sqlSelectedFields) + throws Exception { + return runFormat(formatBuilder.tableLoader(tableLoader()).buildFormat()); + } + + @TestTemplate + public void testNestedProjection() throws Exception { + Schema schema = + new Schema( + required(1, "data", Types.StringType.get()), + required( + 2, + "nested", + Types.StructType.of( + Types.NestedField.required(3, "f1", Types.StringType.get()), + Types.NestedField.required(4, "f2", Types.StringType.get()), + Types.NestedField.required(5, "f3", Types.LongType.get()))), + required(6, "id", Types.LongType.get())); + + Table table = + CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("default", "t"), schema); + + List writeRecords = RandomGenericData.generate(schema, 2, 0L); + new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(writeRecords); + + // Schema: [data, nested[f1, f2, f3], id] + // Projection: [nested.f2, data] + // The Flink SQL output: [f2, data] + // The FlinkInputFormat output: [nested[f2], data] + + TableSchema projectedSchema = + TableSchema.builder() + .field("nested", DataTypes.ROW(DataTypes.FIELD("f2", DataTypes.STRING()))) + .field("data", DataTypes.STRING()) + .build(); + List result = + runFormat( + FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); + + List expected = Lists.newArrayList(); + for (Record record : writeRecords) { + Row nested = Row.of(((Record) record.get(1)).get(1)); + expected.add(Row.of(nested, record.get(0))); + } + + TestHelpers.assertRows(result, expected); + } + + @TestTemplate + public void testBasicProjection() throws IOException { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(2, "time", Types.TimestampType.withZone())); + + Table table = + 
CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("default", "t"), writeSchema); + + List writeRecords = RandomGenericData.generate(writeSchema, 2, 0L); + new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(writeRecords); + + TableSchema projectedSchema = + TableSchema.builder() + .field("id", DataTypes.BIGINT()) + .field("data", DataTypes.STRING()) + .build(); + List result = + runFormat( + FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); + + List expected = Lists.newArrayList(); + for (Record record : writeRecords) { + expected.add(Row.of(record.get(0), record.get(1))); + } + + TestHelpers.assertRows(result, expected); + } + + @TestTemplate + public void testReadPartitionColumn() throws Exception { + assumeThat(fileFormat).as("Temporary skip ORC").isNotEqualTo(FileFormat.ORC); + + Schema nestedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional( + 2, + "struct", + Types.StructType.of( + Types.NestedField.optional(3, "innerId", Types.LongType.get()), + Types.NestedField.optional(4, "innerName", Types.StringType.get())))); + PartitionSpec spec = + PartitionSpec.builderFor(nestedSchema).identity("struct.innerName").build(); + + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, nestedSchema, spec); + List records = RandomGenericData.generate(nestedSchema, 10, 0L); + GenericAppenderHelper appender = + new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + for (Record record : records) { + org.apache.iceberg.TestHelpers.Row partition = + org.apache.iceberg.TestHelpers.Row.of(record.get(1, Record.class).get(1)); + appender.appendToTable(partition, Collections.singletonList(record)); + } + + TableSchema projectedSchema = + TableSchema.builder() + .field("struct", DataTypes.ROW(DataTypes.FIELD("innerName", DataTypes.STRING()))) + .build(); + List result = + runFormat( + FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); + + List expected = Lists.newArrayList(); + for (Record record : records) { + Row nested = Row.of(((Record) record.get(1)).get(1)); + expected.add(Row.of(nested)); + } + + TestHelpers.assertRows(result, expected); + } + + @TestTemplate + public void testValidation() { + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); + + assertThatThrownBy( + () -> + FlinkSource.forRowData() + .env(StreamExecutionEnvironment.getExecutionEnvironment()) + .tableLoader(tableLoader()) + .streaming(false) + .endTag("tag") + .endSnapshotId(1L) + .build()) + .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set.") + .isInstanceOf(IllegalArgumentException.class); + } + + private List runFormat(FlinkInputFormat inputFormat) throws IOException { + RowType rowType = FlinkSchemaUtil.convert(inputFormat.projectedSchema()); + return TestHelpers.readRows(inputFormat, rowType); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java new file mode 100644 index 000000000000..1b4fc863631f --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.Map; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.StructLikeSet; + +public class TestFlinkInputFormatReaderDeletes extends TestFlinkReaderDeletesBase { + + @Override + protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) + throws IOException { + Schema projected = testTable.schema().select(columns); + RowType rowType = FlinkSchemaUtil.convert(projected); + Map properties = Maps.newHashMap(); + properties.put( + CatalogProperties.WAREHOUSE_LOCATION, + hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); + properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); + properties.put( + CatalogProperties.CLIENT_POOL_SIZE, + Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); + CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); + FlinkInputFormat inputFormat = + FlinkSource.forRowData() + .tableLoader( + TableLoader.fromCatalog( + hiveCatalogLoader, TableIdentifier.of("default", tableName))) + .project(FlinkSchemaUtil.toSchema(rowType)) + .buildFormat(); + + StructLikeSet set = StructLikeSet.create(projected.asStruct()); + TestHelpers.readRowData(inputFormat, rowType) + .forEach( + rowData -> { + RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); + set.add(wrapper.wrap(rowData)); + }); + + return set; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java new file mode 100644 index 000000000000..59a4c3118cdf --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.TestMergingMetrics; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.RowDataConverter; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.sink.FlinkAppenderFactory; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.extension.RegisterExtension; + +public class TestFlinkMergingMetrics extends TestMergingMetrics { + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension("test_db", "test_table"); + + @Override + protected FileAppender writeAndGetAppender(List records) throws IOException { + Table table = CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); + RowType flinkSchema = FlinkSchemaUtil.convert(SCHEMA); + FileAppender appender = + new FlinkAppenderFactory( + table, + SCHEMA, + flinkSchema, + ImmutableMap.of(), + PartitionSpec.unpartitioned(), + null, + null, + null) + .newAppender( + Files.localOutput(File.createTempFile("junit", null, tempDir)), fileFormat); + try (FileAppender fileAppender = appender) { + records.stream().map(r -> RowDataConverter.convert(SCHEMA, r)).forEach(fileAppender::add); + } + return appender; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java new file mode 100644 index 000000000000..8352924d042a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java @@ -0,0 +1,813 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.time.Instant; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.avro.generic.GenericData; +import org.apache.commons.collections.ListUtils; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.types.Row; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Files; +import org.apache.iceberg.HasTableOperations; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.MetadataTableUtils; +import org.apache.iceberg.MetricsUtil; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.FileHelpers; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.CatalogTestBase; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.SnapshotUtil; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.io.TempDir; + +public class TestFlinkMetaDataTable extends CatalogTestBase { + private static final String TABLE_NAME = "test_table"; + private final FileFormat format = FileFormat.AVRO; + private @TempDir Path temp; + + @Parameter(index = 2) + private Boolean isPartition; + + @Parameters(name = "catalogName={0}, baseNamespace={1}, isPartition={2}") + protected static List parameters() { + List parameters = Lists.newArrayList(); + + for (Boolean isPartition : new Boolean[] {true, false}) { + String catalogName = "testhadoop"; + Namespace baseNamespace = Namespace.of("default"); + parameters.add(new Object[] {catalogName, baseNamespace, isPartition}); + } + return parameters; + } + + @Override + protected TableEnvironment getTableEnv() { + Configuration configuration = super.getTableEnv().getConfig().getConfiguration(); + configuration.set(CoreOptions.DEFAULT_PARALLELISM, 1); + return super.getTableEnv(); + } + + @BeforeEach + public void before() { + super.before(); + sql("USE CATALOG %s", catalogName); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE %s", DATABASE); + if (isPartition) { + sql( + "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) PARTITIONED BY (data) WITH ('format-version'='2', 'write.format.default'='%s')", + TABLE_NAME, format.name()); + sql("INSERT INTO %s VALUES 
(1,'a',10),(2,'a',20)", TABLE_NAME); + sql("INSERT INTO %s VALUES (1,'b',10),(2,'b',20)", TABLE_NAME); + } else { + sql( + "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('format-version'='2', 'write.format.default'='%s')", + TABLE_NAME, format.name()); + sql( + "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", + TABLE_NAME); + sql("INSERT INTO %s VALUES (4,'iceberg',10)", TABLE_NAME); + } + } + + @Override + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + @TestTemplate + public void testSnapshots() { + String sql = String.format("SELECT * FROM %s$snapshots ", TABLE_NAME); + List result = sql(sql); + + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + Iterator snapshots = table.snapshots().iterator(); + for (Row row : result) { + Snapshot next = snapshots.next(); + assertThat(((Instant) row.getField(0)).toEpochMilli()) + .as("Should have expected timestamp") + .isEqualTo(next.timestampMillis()); + assertThat(next.snapshotId()) + .as("Should have expected snapshot id") + .isEqualTo(next.snapshotId()); + assertThat(row.getField(2)).as("Should have expected parent id").isEqualTo(next.parentId()); + assertThat(row.getField(3)).as("Should have expected operation").isEqualTo(next.operation()); + assertThat(row.getField(4)) + .as("Should have expected manifest list location") + .isEqualTo(next.manifestListLocation()); + assertThat(row.getField(5)).as("Should have expected summary").isEqualTo(next.summary()); + } + } + + @TestTemplate + public void testHistory() { + String sql = String.format("SELECT * FROM %s$history ", TABLE_NAME); + List result = sql(sql); + + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + Iterator snapshots = table.snapshots().iterator(); + for (Row row : result) { + Snapshot next = snapshots.next(); + assertThat(((Instant) row.getField(0)).toEpochMilli()) + .as("Should have expected made_current_at") + .isEqualTo(next.timestampMillis()); + assertThat(row.getField(1)) + .as("Should have expected snapshot id") + .isEqualTo(next.snapshotId()); + assertThat(row.getField(2)).as("Should have expected parent id").isEqualTo(next.parentId()); + assertThat(row.getField(3)) + .as("Should have expected is current ancestor") + .isEqualTo( + SnapshotUtil.isAncestorOf( + table, table.currentSnapshot().snapshotId(), next.snapshotId())); + } + } + + @TestTemplate + public void testManifests() { + String sql = String.format("SELECT * FROM %s$manifests ", TABLE_NAME); + List result = sql(sql); + + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + List expectedDataManifests = dataManifests(table); + + for (int i = 0; i < result.size(); i++) { + Row row = result.get(i); + ManifestFile manifestFile = expectedDataManifests.get(i); + assertThat(row.getField(0)) + .as("Should have expected content") + .isEqualTo(manifestFile.content().id()); + assertThat(row.getField(1)).as("Should have expected path").isEqualTo(manifestFile.path()); + assertThat(row.getField(2)) + .as("Should have expected length") + .isEqualTo(manifestFile.length()); + assertThat(row.getField(3)) + .as("Should have expected partition_spec_id") + .isEqualTo(manifestFile.partitionSpecId()); + assertThat(row.getField(4)) + .as("Should have expected added_snapshot_id") + .isEqualTo(manifestFile.snapshotId()); + assertThat(row.getField(5)) + 
.as("Should have expected added_data_files_count") + .isEqualTo(manifestFile.addedFilesCount()); + assertThat(row.getField(6)) + .as("Should have expected existing_data_files_count") + .isEqualTo(manifestFile.existingFilesCount()); + assertThat(row.getField(7)) + .as("Should have expected deleted_data_files_count") + .isEqualTo(manifestFile.deletedFilesCount()); + } + } + + @TestTemplate + public void testAllManifests() { + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + String sql = String.format("SELECT * FROM %s$all_manifests ", TABLE_NAME); + List result = sql(sql); + + List expectedDataManifests = allDataManifests(table); + + assertThat(expectedDataManifests).hasSize(result.size()); + for (int i = 0; i < result.size(); i++) { + Row row = result.get(i); + ManifestFile manifestFile = expectedDataManifests.get(i); + assertThat(row.getField(0)) + .as("Should have expected content") + .isEqualTo(manifestFile.content().id()); + assertThat(row.getField(1)).as("Should have expected path").isEqualTo(manifestFile.path()); + assertThat(row.getField(2)) + .as("Should have expected length") + .isEqualTo(manifestFile.length()); + assertThat(row.getField(3)) + .as("Should have expected partition_spec_id") + .isEqualTo(manifestFile.partitionSpecId()); + assertThat(row.getField(4)) + .as("Should have expected added_snapshot_id") + .isEqualTo(manifestFile.snapshotId()); + assertThat(row.getField(5)) + .as("Should have expected added_data_files_count") + .isEqualTo(manifestFile.addedFilesCount()); + assertThat(row.getField(6)) + .as("Should have expected existing_data_files_count") + .isEqualTo(manifestFile.existingFilesCount()); + assertThat(row.getField(7)) + .as("Should have expected deleted_data_files_count") + .isEqualTo(manifestFile.deletedFilesCount()); + } + } + + @TestTemplate + public void testUnPartitionedTable() throws IOException { + assumeThat(isPartition).isFalse(); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + Schema deleteRowSchema = table.schema().select("id"); + Record dataDelete = GenericRecord.create(deleteRowSchema); + List dataDeletes = Lists.newArrayList(dataDelete.copy("id", 1)); + File testFile = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(testFile), dataDeletes, deleteRowSchema); + table.newRowDelta().addDeletes(eqDeletes).commit(); + + List expectedDataManifests = dataManifests(table); + List expectedDeleteManifests = deleteManifests(table); + + assertThat(expectedDataManifests).hasSize(2); + assertThat(expectedDeleteManifests).hasSize(1); + + Schema entriesTableSchema = + MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) + .schema(); + + // check delete files table + Schema deleteFilesTableSchema = + MetadataTableUtils.createMetadataTableInstance( + table, MetadataTableType.from("delete_files")) + .schema(); + + List deleteColumns = + deleteFilesTableSchema.columns().stream() + .map(Types.NestedField::name) + .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) + .collect(Collectors.toList()); + String deleteNames = + deleteColumns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); + + deleteFilesTableSchema = deleteFilesTableSchema.select(deleteColumns); + + List actualDeleteFiles = sql("SELECT %s FROM %s$delete_files", deleteNames, TABLE_NAME); + assertThat(actualDeleteFiles).hasSize(1); + 
assertThat(expectedDeleteManifests).as("Should have 1 delete manifest").hasSize(1); + + List expectedDeleteFiles = + expectedEntries( + table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, null); + assertThat(expectedDeleteFiles).as("Should be 1 delete file manifest entry").hasSize(1); + TestHelpers.assertEquals( + deleteFilesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + + // Check data files table + Schema filesTableSchema = + MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("files")) + .schema(); + + List columns = + filesTableSchema.columns().stream() + .map(Types.NestedField::name) + .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) + .collect(Collectors.toList()); + String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); + + filesTableSchema = filesTableSchema.select(columns); + + List actualDataFiles = sql("SELECT %s FROM %s$data_files", names, TABLE_NAME); + assertThat(actualDataFiles).as("Metadata table should return 2 data file").hasSize(2); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); + assertThat(expectedDataFiles).as("Should be 2 data file manifest entry").hasSize(2); + TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); + + // check all files table + List actualFiles = sql("SELECT %s FROM %s$files ORDER BY content", names, TABLE_NAME); + assertThat(actualFiles).as("Metadata table should return 3 files").hasSize(3); + List expectedFiles = + Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) + .collect(Collectors.toList()); + assertThat(expectedFiles).as("Should have 3 files manifest entriess").hasSize(3); + TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(0), actualFiles.get(0)); + TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(1), actualFiles.get(1)); + } + + @TestTemplate + public void testPartitionedTable() throws Exception { + assumeThat(isPartition).isTrue(); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + Schema deleteRowSchema = table.schema().select("id", "data"); + Record dataDelete = GenericRecord.create(deleteRowSchema); + + Map deleteRow = Maps.newHashMap(); + deleteRow.put("id", 1); + deleteRow.put("data", "a"); + File testFile = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(testFile), + org.apache.iceberg.TestHelpers.Row.of("a"), + Lists.newArrayList(dataDelete.copy(deleteRow)), + deleteRowSchema); + table.newRowDelta().addDeletes(eqDeletes).commit(); + + deleteRow.put("data", "b"); + File testFile2 = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes2 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(testFile2), + org.apache.iceberg.TestHelpers.Row.of("b"), + Lists.newArrayList(dataDelete.copy(deleteRow)), + deleteRowSchema); + table.newRowDelta().addDeletes(eqDeletes2).commit(); + + Schema entriesTableSchema = + MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) + .schema(); + + List expectedDataManifests = dataManifests(table); + List expectedDeleteManifests = deleteManifests(table); + + assertThat(expectedDataManifests).hasSize(2); + assertThat(expectedDeleteManifests).hasSize(2); + Table deleteFilesTable = + MetadataTableUtils.createMetadataTableInstance( + table, 
MetadataTableType.from("delete_files")); + Schema filesTableSchema = deleteFilesTable.schema(); + + List columns = + filesTableSchema.columns().stream() + .map(Types.NestedField::name) + .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) + .collect(Collectors.toList()); + String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); + + filesTableSchema = filesTableSchema.select(columns); + + // Check delete files table + List expectedDeleteFiles = + expectedEntries( + table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); + assertThat(expectedDeleteFiles).hasSize(1); + List actualDeleteFiles = + sql("SELECT %s FROM %s$delete_files WHERE `partition`.`data`='a'", names, TABLE_NAME); + + assertThat(actualDeleteFiles).hasSize(1); + TestHelpers.assertEquals( + filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + + // Check data files table + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); + assertThat(expectedDataFiles).hasSize(1); + List actualDataFiles = + sql("SELECT %s FROM %s$data_files WHERE `partition`.`data`='a'", names, TABLE_NAME); + assertThat(actualDataFiles).hasSize(1); + TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); + + List actualPartitionsWithProjection = + sql("SELECT file_count FROM %s$partitions ", TABLE_NAME); + assertThat(actualPartitionsWithProjection).hasSize(2); + for (int i = 0; i < 2; ++i) { + assertThat(actualPartitionsWithProjection.get(i).getField(0)).isEqualTo(1); + } + + // Check files table + List expectedFiles = + Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) + .collect(Collectors.toList()); + assertThat(expectedFiles).hasSize(2); + List actualFiles = + sql( + "SELECT %s FROM %s$files WHERE `partition`.`data`='a' ORDER BY content", + names, TABLE_NAME); + assertThat(actualFiles).hasSize(2); + TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(0), actualFiles.get(0)); + TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(1), actualFiles.get(1)); + } + + @TestTemplate + public void testAllFilesUnpartitioned() throws Exception { + assumeThat(isPartition).isFalse(); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + Schema deleteRowSchema = table.schema().select("id", "data"); + Record dataDelete = GenericRecord.create(deleteRowSchema); + + Map deleteRow = Maps.newHashMap(); + deleteRow.put("id", 1); + File testFile = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(testFile), + Lists.newArrayList(dataDelete.copy(deleteRow)), + deleteRowSchema); + table.newRowDelta().addDeletes(eqDeletes).commit(); + + List expectedDataManifests = dataManifests(table); + assertThat(expectedDataManifests).hasSize(2); + List expectedDeleteManifests = deleteManifests(table); + assertThat(expectedDeleteManifests).hasSize(1); + + // Clear table to test whether 'all_files' can read past files + table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); + + Schema entriesTableSchema = + MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) + .schema(); + Schema filesTableSchema = + MetadataTableUtils.createMetadataTableInstance( + table, MetadataTableType.from("all_data_files")) + .schema(); + + List columns = + filesTableSchema.columns().stream() + 
.map(Types.NestedField::name) + .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) + .collect(Collectors.toList()); + String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); + + filesTableSchema = filesTableSchema.select(columns); + + // Check all data files table + List actualDataFiles = + sql("SELECT %s FROM %s$all_data_files order by record_count ", names, TABLE_NAME); + + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); + assertThat(expectedDataFiles).hasSize(2); + assertThat(actualDataFiles).hasSize(2); + TestHelpers.assertEquals(filesTableSchema, expectedDataFiles, actualDataFiles); + + // Check all delete files table + List actualDeleteFiles = sql("SELECT %s FROM %s$all_delete_files", names, TABLE_NAME); + List expectedDeleteFiles = + expectedEntries( + table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, null); + assertThat(expectedDeleteFiles).hasSize(1); + assertThat(actualDeleteFiles).hasSize(1); + TestHelpers.assertEquals( + filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + + // Check all files table + List actualFiles = + sql("SELECT %s FROM %s$all_files ORDER BY content, record_count asc", names, TABLE_NAME); + List expectedFiles = + ListUtils.union(expectedDataFiles, expectedDeleteFiles); + expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); + assertThat(actualFiles).hasSize(3); + TestHelpers.assertEquals(filesTableSchema, expectedFiles, actualFiles); + } + + @TestTemplate + public void testAllFilesPartitioned() throws Exception { + assumeThat(!isPartition).isFalse(); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + // Create delete file + Schema deleteRowSchema = table.schema().select("id"); + Record dataDelete = GenericRecord.create(deleteRowSchema); + + Map deleteRow = Maps.newHashMap(); + deleteRow.put("id", 1); + File testFile = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(testFile), + org.apache.iceberg.TestHelpers.Row.of("a"), + Lists.newArrayList(dataDelete.copy(deleteRow)), + deleteRowSchema); + File testFile2 = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes2 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(testFile2), + org.apache.iceberg.TestHelpers.Row.of("b"), + Lists.newArrayList(dataDelete.copy(deleteRow)), + deleteRowSchema); + table.newRowDelta().addDeletes(eqDeletes).addDeletes(eqDeletes2).commit(); + + List expectedDataManifests = dataManifests(table); + assertThat(expectedDataManifests).hasSize(2); + List expectedDeleteManifests = deleteManifests(table); + assertThat(expectedDeleteManifests).hasSize(1); + // Clear table to test whether 'all_files' can read past files + table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); + + Schema entriesTableSchema = + MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) + .schema(); + Schema filesTableSchema = + MetadataTableUtils.createMetadataTableInstance( + table, MetadataTableType.from("all_data_files")) + .schema(); + + List columns = + filesTableSchema.columns().stream() + .map(Types.NestedField::name) + .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) + .collect(Collectors.toList()); + String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); + + 
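+ // readable_metrics is a derived metadata column that does not appear in the expected rows built from the
+ // entries table, so it was filtered out of the projection above before comparing.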
filesTableSchema = filesTableSchema.select(columns); + + // Check all data files table + List actualDataFiles = + sql("SELECT %s FROM %s$all_data_files WHERE `partition`.`data`='a'", names, TABLE_NAME); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); + assertThat(expectedDataFiles).hasSize(1); + assertThat(actualDataFiles).hasSize(1); + TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); + + // Check all delete files table + List actualDeleteFiles = + sql("SELECT %s FROM %s$all_delete_files WHERE `partition`.`data`='a'", names, TABLE_NAME); + List expectedDeleteFiles = + expectedEntries( + table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); + assertThat(expectedDeleteFiles).hasSize(1); + assertThat(actualDeleteFiles).hasSize(1); + TestHelpers.assertEquals( + filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + + // Check all files table + List actualFiles = + sql( + "SELECT %s FROM %s$all_files WHERE `partition`.`data`='a' ORDER BY content", + names, TABLE_NAME); + List expectedFiles = + ListUtils.union(expectedDataFiles, expectedDeleteFiles); + expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); + assertThat(actualFiles).hasSize(2); + TestHelpers.assertEquals(filesTableSchema, expectedFiles, actualFiles); + } + + @TestTemplate + public void testMetadataLogEntries() { + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + Long currentSnapshotId = table.currentSnapshot().snapshotId(); + TableMetadata tableMetadata = ((HasTableOperations) table).operations().current(); + Snapshot currentSnapshot = tableMetadata.currentSnapshot(); + Snapshot parentSnapshot = table.snapshot(currentSnapshot.parentId()); + List metadataLogEntries = + Lists.newArrayList(tableMetadata.previousFiles()); + + // Check metadataLog table + List metadataLogs = sql("SELECT * FROM %s$metadata_log_entries", TABLE_NAME); + + assertThat(metadataLogs).hasSize(3); + Row metadataLog = metadataLogs.get(0); + assertThat(metadataLog.getField("timestamp")) + .isEqualTo(Instant.ofEpochMilli(metadataLogEntries.get(0).timestampMillis())); + assertThat(metadataLog.getField("file")).isEqualTo(metadataLogEntries.get(0).file()); + assertThat(metadataLog.getField("latest_snapshot_id")).isNull(); + assertThat(metadataLog.getField("latest_schema_id")).isNull(); + assertThat(metadataLog.getField("latest_sequence_number")).isNull(); + + metadataLog = metadataLogs.get(1); + assertThat(metadataLog.getField("timestamp")) + .isEqualTo(Instant.ofEpochMilli(metadataLogEntries.get(1).timestampMillis())); + assertThat(metadataLog.getField("file")).isEqualTo(metadataLogEntries.get(1).file()); + assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(parentSnapshot.snapshotId()); + assertThat(metadataLog.getField("latest_schema_id")).isEqualTo(parentSnapshot.schemaId()); + assertThat(metadataLog.getField("latest_sequence_number")) + .isEqualTo(parentSnapshot.sequenceNumber()); + assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(parentSnapshot.snapshotId()); + + metadataLog = metadataLogs.get(2); + assertThat(metadataLog.getField("timestamp")) + .isEqualTo(Instant.ofEpochMilli(currentSnapshot.timestampMillis())); + assertThat(metadataLog.getField("file")).isEqualTo(tableMetadata.metadataFileLocation()); + 
assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(currentSnapshot.snapshotId()); + assertThat(metadataLog.getField("latest_schema_id")).isEqualTo(currentSnapshot.schemaId()); + assertThat(metadataLog.getField("latest_sequence_number")) + .isEqualTo(currentSnapshot.sequenceNumber()); + + // test filtering + List metadataLogWithFilters = + sql( + "SELECT * FROM %s$metadata_log_entries WHERE latest_snapshot_id = %s", + TABLE_NAME, currentSnapshotId); + assertThat(metadataLogWithFilters).hasSize(1); + metadataLog = metadataLogWithFilters.get(0); + assertThat(Instant.ofEpochMilli(tableMetadata.currentSnapshot().timestampMillis())) + .isEqualTo(metadataLog.getField("timestamp")); + + assertThat(metadataLog.getField("file")).isEqualTo(tableMetadata.metadataFileLocation()); + assertThat(metadataLog.getField("latest_snapshot_id")) + .isEqualTo(tableMetadata.currentSnapshot().snapshotId()); + assertThat(metadataLog.getField("latest_schema_id")) + .isEqualTo(tableMetadata.currentSnapshot().schemaId()); + assertThat(metadataLog.getField("latest_sequence_number")) + .isEqualTo(tableMetadata.currentSnapshot().sequenceNumber()); + + // test projection + List metadataFiles = + metadataLogEntries.stream() + .map(TableMetadata.MetadataLogEntry::file) + .collect(Collectors.toList()); + metadataFiles.add(tableMetadata.metadataFileLocation()); + List metadataLogWithProjection = + sql("SELECT file FROM %s$metadata_log_entries", TABLE_NAME); + assertThat(metadataLogWithProjection).hasSize(3); + for (int i = 0; i < metadataFiles.size(); i++) { + assertThat(metadataLogWithProjection.get(i).getField("file")).isEqualTo(metadataFiles.get(i)); + } + } + + @TestTemplate + public void testSnapshotReferencesMetatable() { + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + Long currentSnapshotId = table.currentSnapshot().snapshotId(); + + // Create branch + table + .manageSnapshots() + .createBranch("testBranch", currentSnapshotId) + .setMaxRefAgeMs("testBranch", 10) + .setMinSnapshotsToKeep("testBranch", 20) + .setMaxSnapshotAgeMs("testBranch", 30) + .commit(); + // Create Tag + table + .manageSnapshots() + .createTag("testTag", currentSnapshotId) + .setMaxRefAgeMs("testTag", 50) + .commit(); + // Check refs table + List references = sql("SELECT * FROM %s$refs", TABLE_NAME); + List branches = sql("SELECT * FROM %s$refs WHERE type='BRANCH'", TABLE_NAME); + assertThat(references).hasSize(3); + assertThat(branches).hasSize(2); + List tags = sql("SELECT * FROM %s$refs WHERE type='TAG'", TABLE_NAME); + assertThat(tags).hasSize(1); + // Check branch entries in refs table + List mainBranch = + sql("SELECT * FROM %s$refs WHERE name='main' AND type='BRANCH'", TABLE_NAME); + assertThat((String) mainBranch.get(0).getFieldAs("name")).isEqualTo("main"); + assertThat((String) mainBranch.get(0).getFieldAs("type")).isEqualTo("BRANCH"); + assertThat((Long) mainBranch.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); + List testBranch = + sql("SELECT * FROM %s$refs WHERE name='testBranch' AND type='BRANCH'", TABLE_NAME); + assertThat((String) testBranch.get(0).getFieldAs("name")).isEqualTo("testBranch"); + assertThat((String) testBranch.get(0).getFieldAs("type")).isEqualTo("BRANCH"); + assertThat((Long) testBranch.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); + assertThat((Long) testBranch.get(0).getFieldAs("max_reference_age_in_ms")) + .isEqualTo(Long.valueOf(10)); + assertThat((Integer) testBranch.get(0).getFieldAs("min_snapshots_to_keep")) + 
.isEqualTo(Integer.valueOf(20)); + assertThat((Long) testBranch.get(0).getFieldAs("max_snapshot_age_in_ms")) + .isEqualTo(Long.valueOf(30)); + + // Check tag entries in refs table + List testTag = + sql("SELECT * FROM %s$refs WHERE name='testTag' AND type='TAG'", TABLE_NAME); + assertThat((String) testTag.get(0).getFieldAs("name")).isEqualTo("testTag"); + assertThat((String) testTag.get(0).getFieldAs("type")).isEqualTo("TAG"); + assertThat((Long) testTag.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); + assertThat((Long) testTag.get(0).getFieldAs("max_reference_age_in_ms")) + .isEqualTo(Long.valueOf(50)); + // Check projection in refs table + List testTagProjection = + sql( + "SELECT name,type,snapshot_id,max_reference_age_in_ms,min_snapshots_to_keep FROM %s$refs where type='TAG'", + TABLE_NAME); + assertThat((String) testTagProjection.get(0).getFieldAs("name")).isEqualTo("testTag"); + assertThat((String) testTagProjection.get(0).getFieldAs("type")).isEqualTo("TAG"); + assertThat((Long) testTagProjection.get(0).getFieldAs("snapshot_id")) + .isEqualTo(currentSnapshotId); + assertThat((Long) testTagProjection.get(0).getFieldAs("max_reference_age_in_ms")) + .isEqualTo(Long.valueOf(50)); + assertThat((String) testTagProjection.get(0).getFieldAs("min_snapshots_to_keep")).isNull(); + List mainBranchProjection = + sql("SELECT name, type FROM %s$refs WHERE name='main' AND type = 'BRANCH'", TABLE_NAME); + assertThat((String) mainBranchProjection.get(0).getFieldAs("name")).isEqualTo("main"); + assertThat((String) mainBranchProjection.get(0).getFieldAs("type")).isEqualTo("BRANCH"); + List testBranchProjection = + sql( + "SELECT type, name, max_reference_age_in_ms, snapshot_id FROM %s$refs WHERE name='testBranch' AND type = 'BRANCH'", + TABLE_NAME); + assertThat((String) testBranchProjection.get(0).getFieldAs("name")).isEqualTo("testBranch"); + assertThat((String) testBranchProjection.get(0).getFieldAs("type")).isEqualTo("BRANCH"); + assertThat((Long) testBranchProjection.get(0).getFieldAs("snapshot_id")) + .isEqualTo(currentSnapshotId); + assertThat((Long) testBranchProjection.get(0).getFieldAs("max_reference_age_in_ms")) + .isEqualTo(Long.valueOf(10)); + } + + /** + * Find matching manifest entries of an Iceberg table + * + * @param table iceberg table + * @param expectedContent file content to populate on entries + * @param entriesTableSchema schema of Manifest entries + * @param manifestsToExplore manifests to explore of the table + * @param partValue partition value that manifest entries must match, or null to skip filtering + */ + private List expectedEntries( + Table table, + FileContent expectedContent, + Schema entriesTableSchema, + List manifestsToExplore, + String partValue) + throws IOException { + List expected = Lists.newArrayList(); + for (ManifestFile manifest : manifestsToExplore) { + InputFile in = table.io().newInputFile(manifest.path()); + try (CloseableIterable rows = + Avro.read(in).project(entriesTableSchema).build()) { + for (GenericData.Record record : rows) { + if ((Integer) record.get("status") < 2 /* added or existing */) { + GenericData.Record file = (GenericData.Record) record.get("data_file"); + if (partitionMatch(file, partValue)) { + asMetadataRecord(file, expectedContent); + expected.add(file); + } + } + } + } + } + return expected; + } + + // Populate certain fields derived in the metadata tables + private void asMetadataRecord(GenericData.Record file, FileContent content) { + file.put(0, content.id()); + file.put(3, 0); // specId + } + + private 
boolean partitionMatch(GenericData.Record file, String partValue) { + if (partValue == null) { + return true; + } + GenericData.Record partition = (GenericData.Record) file.get(4); + return partValue.equals(partition.get(0).toString()); + } + + private List dataManifests(Table table) { + return table.currentSnapshot().dataManifests(table.io()); + } + + private List allDataManifests(Table table) { + List manifests = Lists.newArrayList(); + for (Snapshot snapshot : table.snapshots()) { + manifests.addAll(snapshot.dataManifests(table.io())); + } + return manifests; + } + + private List deleteManifests(Table table) { + return table.currentSnapshot().deleteManifests(table.io()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java new file mode 100644 index 000000000000..188a44d7cdba --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.Map; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.BaseTable; +import org.apache.iceberg.CatalogUtil; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableOperations; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.DeleteReadTests; +import org.apache.iceberg.hive.HiveCatalog; +import org.apache.iceberg.hive.TestHiveMetastore; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public abstract class TestFlinkReaderDeletesBase extends DeleteReadTests { + + protected static String databaseName = "default"; + + protected static HiveConf hiveConf = null; + protected static HiveCatalog catalog = null; + private static TestHiveMetastore metastore = null; + + @BeforeAll + public static void startMetastore() { + metastore = new TestHiveMetastore(); + metastore.start(); + hiveConf = metastore.hiveConf(); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + } + + @AfterAll + public static void stopMetastore() throws Exception { + metastore.stop(); + catalog = null; + } + + @Override + protected Table createTable(String name, Schema schema, PartitionSpec spec) { + Map props = Maps.newHashMap(); + props.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + + Table table = catalog.createTable(TableIdentifier.of(databaseName, name), schema, spec, props); + TableOperations ops = ((BaseTable) table).operations(); + TableMetadata meta = ops.current(); + ops.commit(meta, meta.upgradeToFormatVersion(formatVersion)); + + return table; + } + + @Override + protected void dropTable(String name) { + catalog.dropTable(TableIdentifier.of(databaseName, name)); + } + + @Override + protected boolean expectPruned() { + return false; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java new file mode 100644 index 000000000000..cf6b233dcec6 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java @@ -0,0 +1,540 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.nio.file.Path; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.DateTimeUtil; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public abstract class TestFlinkScan { + @RegisterExtension + protected static MiniClusterExtension miniClusterExtension = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @TempDir protected Path temporaryDirectory; + + @RegisterExtension + protected static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + @Parameter protected FileFormat fileFormat; + + @Parameters(name = "format={0}") + public static Collection fileFormat() { + return Arrays.asList(FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC); + } + + protected TableLoader tableLoader() { + return CATALOG_EXTENSION.tableLoader(); + } + + protected abstract List runWithProjection(String... 
projected) throws Exception; + + protected abstract List runWithFilter( + Expression filter, String sqlFilter, boolean caseSensitive) throws Exception; + + protected List runWithFilter(Expression filter, String sqlFilter) throws Exception { + return runWithFilter(filter, sqlFilter, true); + } + + protected abstract List runWithOptions(Map options) throws Exception; + + protected abstract List run() throws Exception; + + @TestTemplate + public void testUnpartitionedTable() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(expectedRecords); + TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testPartitionedTable() throws Exception { + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + expectedRecords.get(0).set(2, "2020-03-20"); + new GenericAppenderHelper(table, fileFormat, temporaryDirectory) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testProjection() throws Exception { + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + List inputRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + new GenericAppenderHelper(table, fileFormat, temporaryDirectory) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords); + assertRows(runWithProjection("data"), Row.of(inputRecords.get(0).get(0))); + } + + @TestTemplate + public void testIdentityPartitionProjections() throws Exception { + Schema logSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get())); + PartitionSpec spec = + PartitionSpec.builderFor(logSchema).identity("dt").identity("level").build(); + + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, logSchema, spec); + List inputRecords = RandomGenericData.generate(logSchema, 10, 0L); + + int idx = 0; + AppendFiles append = table.newAppend(); + for (Record record : inputRecords) { + record.set(1, "2020-03-2" + idx); + record.set(2, Integer.toString(idx)); + append.appendFile( + new GenericAppenderHelper(table, fileFormat, temporaryDirectory) + .writeFile( + org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), + ImmutableList.of(record))); + idx += 1; + } + append.commit(); + + // individual fields + validateIdentityPartitionProjections(table, Collections.singletonList("dt"), inputRecords); + validateIdentityPartitionProjections(table, Collections.singletonList("level"), inputRecords); + validateIdentityPartitionProjections(table, Collections.singletonList("message"), inputRecords); + validateIdentityPartitionProjections(table, Collections.singletonList("id"), inputRecords); + // field pairs + validateIdentityPartitionProjections(table, Arrays.asList("dt", "message"), 
inputRecords); + validateIdentityPartitionProjections(table, Arrays.asList("level", "message"), inputRecords); + validateIdentityPartitionProjections(table, Arrays.asList("dt", "level"), inputRecords); + // out-of-order pairs + validateIdentityPartitionProjections(table, Arrays.asList("message", "dt"), inputRecords); + validateIdentityPartitionProjections(table, Arrays.asList("message", "level"), inputRecords); + validateIdentityPartitionProjections(table, Arrays.asList("level", "dt"), inputRecords); + // out-of-order triplets + validateIdentityPartitionProjections( + table, Arrays.asList("dt", "level", "message"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("level", "dt", "message"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("dt", "message", "level"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("level", "message", "dt"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("message", "dt", "level"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("message", "level", "dt"), inputRecords); + } + + private void validateIdentityPartitionProjections( + Table table, List projectedFields, List inputRecords) throws Exception { + List rows = runWithProjection(projectedFields.toArray(new String[0])); + + for (int pos = 0; pos < inputRecords.size(); pos++) { + Record inputRecord = inputRecords.get(pos); + Row actualRecord = rows.get(pos); + + for (int i = 0; i < projectedFields.size(); i++) { + String name = projectedFields.get(i); + assertThat(inputRecord.getField(name)) + .as("Projected field " + name + " should match") + .isEqualTo(actualRecord.getField(i)); + } + } + } + + @TestTemplate + public void testSnapshotReads() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(expectedRecords); + long snapshotId = table.currentSnapshot().snapshotId(); + + long timestampMillis = table.currentSnapshot().timestampMillis(); + + // produce another timestamp + waitUntilAfter(timestampMillis); + helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L)); + + TestHelpers.assertRecords( + runWithOptions(ImmutableMap.of("snapshot-id", Long.toString(snapshotId))), + expectedRecords, + TestFixtures.SCHEMA); + TestHelpers.assertRecords( + runWithOptions(ImmutableMap.of("as-of-timestamp", Long.toString(timestampMillis))), + expectedRecords, + TestFixtures.SCHEMA); + } + + @TestTemplate + public void testTagReads() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + + List expectedRecords1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(expectedRecords1); + long snapshotId = table.currentSnapshot().snapshotId(); + + table.manageSnapshots().createTag("t1", snapshotId).commit(); + + TestHelpers.assertRecords( + runWithOptions(ImmutableMap.of("tag", "t1")), expectedRecords1, TestFixtures.SCHEMA); + + List expectedRecords2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(expectedRecords2); + snapshotId = 
table.currentSnapshot().snapshotId(); + + table.manageSnapshots().replaceTag("t1", snapshotId).commit(); + + List expectedRecords = Lists.newArrayList(); + expectedRecords.addAll(expectedRecords1); + expectedRecords.addAll(expectedRecords2); + TestHelpers.assertRecords( + runWithOptions(ImmutableMap.of("tag", "t1")), expectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testBranchReads() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + + List expectedRecordsBase = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(expectedRecordsBase); + long snapshotId = table.currentSnapshot().snapshotId(); + + String branchName = "b1"; + table.manageSnapshots().createBranch(branchName, snapshotId).commit(); + + List expectedRecordsForBranch = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(branchName, expectedRecordsForBranch); + + List expectedRecordsForMain = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(expectedRecordsForMain); + + List branchExpectedRecords = Lists.newArrayList(); + branchExpectedRecords.addAll(expectedRecordsBase); + branchExpectedRecords.addAll(expectedRecordsForBranch); + + TestHelpers.assertRecords( + runWithOptions(ImmutableMap.of("branch", branchName)), + branchExpectedRecords, + TestFixtures.SCHEMA); + + List mainExpectedRecords = Lists.newArrayList(); + mainExpectedRecords.addAll(expectedRecordsBase); + mainExpectedRecords.addAll(expectedRecordsForMain); + + TestHelpers.assertRecords(run(), mainExpectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testIncrementalReadViaTag() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + + List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(records1); + long snapshotId1 = table.currentSnapshot().snapshotId(); + String startTag = "t1"; + table.manageSnapshots().createTag(startTag, snapshotId1).commit(); + + List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1L); + helper.appendToTable(records2); + + List records3 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 2L); + helper.appendToTable(records3); + long snapshotId3 = table.currentSnapshot().snapshotId(); + String endTag = "t2"; + table.manageSnapshots().createTag(endTag, snapshotId3).commit(); + + helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 3L)); + + List expected = Lists.newArrayList(); + expected.addAll(records2); + expected.addAll(records3); + + TestHelpers.assertRecords( + runWithOptions( + ImmutableMap.builder() + .put("start-tag", startTag) + .put("end-tag", endTag) + .buildOrThrow()), + expected, + TestFixtures.SCHEMA); + + TestHelpers.assertRecords( + runWithOptions( + ImmutableMap.builder() + .put("start-snapshot-id", Long.toString(snapshotId1)) + .put("end-tag", endTag) + .buildOrThrow()), + expected, + TestFixtures.SCHEMA); + + TestHelpers.assertRecords( + runWithOptions( + ImmutableMap.builder() + .put("start-tag", startTag) + .put("end-snapshot-id", Long.toString(snapshotId3)) + .buildOrThrow()), + expected, + TestFixtures.SCHEMA); + + assertThatThrownBy( + () -> + runWithOptions( + 
ImmutableMap.<String, String>builder() + .put("start-tag", startTag) + .put("end-tag", endTag) + .put("start-snapshot-id", Long.toString(snapshotId1)) + .buildOrThrow())) + .isInstanceOf(Exception.class) + .hasMessage("START_SNAPSHOT_ID and START_TAG cannot both be set."); + + assertThatThrownBy( + () -> + runWithOptions( + ImmutableMap.<String, String>builder() + .put("start-tag", startTag) + .put("end-tag", endTag) + .put("end-snapshot-id", Long.toString(snapshotId3)) + .buildOrThrow())) + .isInstanceOf(Exception.class) + .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set."); + } + + @TestTemplate + public void testIncrementalRead() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + + List<Record> records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(records1); + long snapshotId1 = table.currentSnapshot().snapshotId(); + + // snapshot 2 + List<Record> records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1L); + helper.appendToTable(records2); + + List<Record> records3 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 2L); + helper.appendToTable(records3); + long snapshotId3 = table.currentSnapshot().snapshotId(); + + // snapshot 4 + helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 3L)); + + List<Record> expected2 = Lists.newArrayList(); + expected2.addAll(records2); + expected2.addAll(records3); + TestHelpers.assertRecords( + runWithOptions( + ImmutableMap.<String, String>builder() + .put("start-snapshot-id", Long.toString(snapshotId1)) + .put("end-snapshot-id", Long.toString(snapshotId3)) + .buildOrThrow()), + expected2, + TestFixtures.SCHEMA); + } + + @TestTemplate + public void testFilterExpPartition() throws Exception { + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + + List<Record> expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + expectedRecords.get(0).set(2, "2020-03-20"); + expectedRecords.get(1).set(2, "2020-03-20"); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + DataFile dataFile1 = + helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + DataFile dataFile2 = + helper.writeFile( + org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + helper.appendToTable(dataFile1, dataFile2); + TestHelpers.assertRecords( + runWithFilter(Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'", true), + expectedRecords, + TestFixtures.SCHEMA); + } + + private void testFilterExp(Expression filter, String sqlFilter, boolean caseSensitive) + throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + List<Record> expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 3, 0L); + expectedRecords.get(0).set(0, "a"); + expectedRecords.get(1).set(0, "b"); + expectedRecords.get(2).set(0, "c"); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + DataFile dataFile = helper.writeFile(expectedRecords); + helper.appendToTable(dataFile); + + List<Row> actual = + runWithFilter(filter, sqlFilter, caseSensitive); + + TestHelpers.assertRecords(actual, expectedRecords.subList(1, 3), TestFixtures.SCHEMA); + } + +
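For readers skimming the diff, the filter tests above and below reduce to one pattern: build an Iceberg Expression, hand it to the source builder, and verify that only matching rows come back. A minimal standalone sketch of that pattern, outside the test harness and assuming a hypothetical Hadoop table at file:///tmp/warehouse/db/t (the tests use HadoopCatalogExtension with a temporary directory instead; the class name FilterPushdownSketch is made up), might look like this:

import java.util.Collections;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.source.FlinkSource;

public class FilterPushdownSketch {
  public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // Hypothetical table location; any TableLoader pointing at an Iceberg table works here.
    TableLoader loader = TableLoader.fromHadoopTable("file:///tmp/warehouse/db/t");
    // The Expression is pushed into the Iceberg scan, mirroring runWithFilter(filter, sqlFilter, caseSensitive).
    DataStream<RowData> stream =
        FlinkSource.forRowData()
            .env(env)
            .tableLoader(loader)
            .filters(Collections.singletonList(Expressions.greaterThanOrEqual("data", "b")))
            .caseSensitive(true)
            .streaming(false)
            .build();
    stream.print();
    env.execute("iceberg-filter-pushdown-sketch");
  }
}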
@TestTemplate + public void testFilterExp() throws Exception { + testFilterExp(Expressions.greaterThanOrEqual("data", "b"), "where data>='b'", true); + } + + @TestTemplate + public void testFilterExpCaseInsensitive() throws Exception { + // sqlFilter does not support case-insensitive filtering: + // https://issues.apache.org/jira/browse/FLINK-16175 + testFilterExp(Expressions.greaterThanOrEqual("DATA", "b"), "where data>='b'", false); + } + + @TestTemplate + public void testPartitionTypes() throws Exception { + Schema typesSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), + Types.NestedField.optional(3, "str", Types.StringType.get()), + Types.NestedField.optional(4, "binary", Types.BinaryType.get()), + Types.NestedField.optional(5, "date", Types.DateType.get()), + Types.NestedField.optional(6, "time", Types.TimeType.get()), + Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone())); + PartitionSpec spec = + PartitionSpec.builderFor(typesSchema) + .identity("decimal") + .identity("str") + .identity("binary") + .identity("date") + .identity("time") + .identity("timestamp") + .build(); + + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, typesSchema, spec); + List records = RandomGenericData.generate(typesSchema, 10, 0L); + GenericAppenderHelper appender = + new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + for (Record record : records) { + org.apache.iceberg.TestHelpers.Row partition = + org.apache.iceberg.TestHelpers.Row.of( + record.get(1), + record.get(2), + record.get(3), + record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), + record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), + record.get(6) == null + ? null + : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6))); + appender.appendToTable(partition, Collections.singletonList(record)); + } + + TestHelpers.assertRecords(run(), records, typesSchema); + } + + @TestTemplate + public void testCustomizedFlinkDataTypes() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required( + 1, + "map", + Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StringType.get())), + Types.NestedField.required( + 4, "arr", Types.ListType.ofRequired(5, Types.StringType.get()))); + Table table = CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, schema); + List records = RandomGenericData.generate(schema, 10, 0L); + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + helper.appendToTable(records); + TestHelpers.assertRecords(run(), records, schema); + } + + private static void assertRows(List results, Row... 
expected) { + TestHelpers.assertRows(results, Arrays.asList(expected)); + } + + private static void waitUntilAfter(long timestampMillis) { + long current = System.currentTimeMillis(); + while (current <= timestampMillis) { + current = System.currentTimeMillis(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java new file mode 100644 index 000000000000..1493c0932044 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.config.TableConfigOptions; +import org.apache.flink.types.Row; +import org.junit.jupiter.api.BeforeEach; + +/** Test Flink SELECT SQLs. */ +public class TestFlinkScanSql extends TestFlinkSource { + private volatile TableEnvironment tEnv; + + @BeforeEach + public void before() throws IOException { + SqlHelpers.sql( + getTableEnv(), + "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_EXTENSION.warehouse()); + SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); + getTableEnv() + .getConfig() + .getConfiguration() + .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + } + + private TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + this.tEnv = + TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); + } + } + } + return tEnv; + } + + @Override + protected List run( + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... sqlSelectedFields) { + String select = String.join(",", sqlSelectedFields); + String optionStr = SqlHelpers.sqlOptionsToString(sqlOptions); + return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java new file mode 100644 index 000000000000..0e5b0f335418 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import org.apache.flink.table.legacy.api.TableColumn; +import org.apache.flink.table.legacy.api.TableSchema; +import org.apache.flink.types.Row; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +public abstract class TestFlinkSource extends TestFlinkScan { + + @Override + protected List runWithProjection(String... projected) throws Exception { + TableSchema.Builder builder = TableSchema.builder(); + TableSchema schema = + FlinkSchemaUtil.toSchema( + FlinkSchemaUtil.convert( + CATALOG_EXTENSION.catalog().loadTable(TestFixtures.TABLE_IDENTIFIER).schema())); + for (String field : projected) { + TableColumn column = schema.getTableColumn(field).get(); + builder.field(column.getName(), column.getType()); + } + return run(FlinkSource.forRowData().project(builder.build()), Maps.newHashMap(), "", projected); + } + + @Override + protected List runWithFilter(Expression filter, String sqlFilter, boolean caseSensitive) + throws Exception { + FlinkSource.Builder builder = + FlinkSource.forRowData().filters(Collections.singletonList(filter)); + Map options = Maps.newHashMap(); + options.put("case-sensitive", Boolean.toString(caseSensitive)); + return run(builder, options, sqlFilter, "*"); + } + + @Override + protected List runWithOptions(Map options) throws Exception { + FlinkSource.Builder builder = FlinkSource.forRowData(); + Optional.ofNullable(options.get("case-sensitive")) + .ifPresent(value -> builder.caseSensitive(Boolean.parseBoolean(value))); + Optional.ofNullable(options.get("snapshot-id")) + .ifPresent(value -> builder.snapshotId(Long.parseLong(value))); + Optional.ofNullable(options.get("tag")).ifPresent(value -> builder.tag(value)); + Optional.ofNullable(options.get("branch")).ifPresent(value -> builder.branch(value)); + Optional.ofNullable(options.get("start-tag")).ifPresent(value -> builder.startTag(value)); + Optional.ofNullable(options.get("end-tag")).ifPresent(value -> builder.endTag(value)); + Optional.ofNullable(options.get("start-snapshot-id")) + .ifPresent(value -> builder.startSnapshotId(Long.parseLong(value))); + Optional.ofNullable(options.get("end-snapshot-id")) + .ifPresent(value -> builder.endSnapshotId(Long.parseLong(value))); + Optional.ofNullable(options.get("as-of-timestamp")) + .ifPresent(value -> builder.asOfTimestamp(Long.parseLong(value))); + return run(builder, options, "", "*"); + } + + @Override + protected List run() throws Exception { + return run(FlinkSource.forRowData(), Maps.newHashMap(), "", "*"); + } + + protected abstract List run( + 
FlinkSource.Builder formatBuilder, + Map<String, String> sqlOptions, + String sqlFilter, + String... sqlSelectedFields) + throws Exception; +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java new file mode 100644 index 000000000000..14131d9e96d5 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.List; +import org.apache.flink.types.Row; +import org.apache.iceberg.flink.FlinkReadOptions; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkSourceConfig extends TableSourceTestBase { + private static final String TABLE = "test_table"; + + @TestTemplate + public void testFlinkSessionConfig() { + getTableEnv().getConfig().set(FlinkReadOptions.STREAMING_OPTION, true); + assertThatThrownBy(() -> sql("SELECT * FROM %s /*+ OPTIONS('as-of-timestamp'='1')*/", TABLE)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot set as-of-timestamp option for streaming reader"); + } + + @TestTemplate + public void testFlinkHintConfig() { + List<Row> result = + sql( + "SELECT * FROM %s /*+ OPTIONS('as-of-timestamp'='%d','streaming'='false')*/", + TABLE, System.currentTimeMillis()); + assertThat(result).hasSize(3); + } + + @TestTemplate + public void testReadOptionHierarchy() { + getTableEnv().getConfig().set(FlinkReadOptions.LIMIT_OPTION, 1L); + List<Row> result = sql("SELECT * FROM %s", TABLE); + // Note that this query doesn't have the limit clause in the SQL. + // This assertion works because limit is pushed down to the reader and + // reader parallelism is 1. + assertThat(result).hasSize(1); + + result = sql("SELECT * FROM %s /*+ OPTIONS('limit'='3')*/", TABLE); + assertThat(result).hasSize(3); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java new file mode 100644 index 000000000000..2dc5bc5c658e --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.List; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.PipelineOptions; +import org.apache.flink.table.api.config.TableConfigOptions; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** Use the FlinkSource */ +public class TestFlinkSourceSql extends TestSqlBase { + @BeforeEach + @Override + public void before() throws IOException { + SqlHelpers.sql( + getTableEnv(), + "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_EXTENSION.warehouse()); + SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); + getTableEnv() + .getConfig() + .getConfiguration() + .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + } + + @Test + public void testInferParallelismWithGlobalSetting() throws IOException { + Configuration cfg = getTableEnv().getConfig().getConfiguration(); + cfg.set(PipelineOptions.MAX_PARALLELISM, 1); + + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, null); + + GenericAppenderHelper helper = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); + List expectedRecords = Lists.newArrayList(); + long maxFileLen = 0; + for (int i = 0; i < 5; i++) { + List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); + DataFile dataFile = helper.writeFile(null, records); + helper.appendToTable(dataFile); + expectedRecords.addAll(records); + maxFileLen = Math.max(dataFile.fileSizeInBytes(), maxFileLen); + } + + // Make sure to generate multiple CombinedScanTasks + SqlHelpers.sql( + getTableEnv(), + "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", + maxFileLen); + + List results = run(Maps.newHashMap(), "", "*"); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java new file mode 100644 index 000000000000..18528c789114 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java @@ -0,0 +1,561 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.List; +import org.apache.flink.table.api.SqlParserException; +import org.apache.flink.types.Row; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkTableSource extends TableSourceTestBase { + + @TestTemplate + public void testLimitPushDown() { + + assertThatThrownBy(() -> sql("SELECT * FROM %s LIMIT -1", TABLE_NAME)) + .isInstanceOf(SqlParserException.class) + .hasMessageStartingWith("SQL parse failed."); + + assertThat(sql("SELECT * FROM %s LIMIT 0", TABLE_NAME)).isEmpty(); + + String sqlLimitExceed = String.format("SELECT * FROM %s LIMIT 4", TABLE_NAME); + List resultExceed = sql(sqlLimitExceed); + assertThat(resultExceed).hasSize(3); + List expectedList = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedList, resultExceed); + + String querySql = String.format("SELECT * FROM %s LIMIT 1", TABLE_NAME); + String explain = getTableEnv().explainSql(querySql); + String expectedExplain = "limit=[1]"; + assertThat(explain).as("Explain should contain LimitPushDown").contains(expectedExplain); + List result = sql(querySql); + assertThat(result).hasSize(1); + assertThat(result).containsAnyElementsOf(expectedList); + + String sqlMixed = String.format("SELECT * FROM %s WHERE id = 1 LIMIT 2", TABLE_NAME); + List mixedResult = sql(sqlMixed); + assertThat(mixedResult).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + } + + @TestTemplate + public void testNoFilterPushDown() { + String sql = String.format("SELECT * FROM %s ", TABLE_NAME); + List result = sql(sql); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedRecords, result); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + } + + @TestTemplate + public void testFilterPushDownEqual() { + String sqlLiteralRight = String.format("SELECT * FROM %s WHERE id = 1 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") == 1"; + + List result = sql(sqlLiteralRight); + assertThat(result).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownEqualNull() { + String sqlEqualNull = String.format("SELECT * FROM %s WHERE data = NULL ", 
TABLE_NAME); + + List result = sql(sqlEqualNull); + assertThat(result).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + @TestTemplate + public void testFilterPushDownEqualLiteralOnLeft() { + String sqlLiteralLeft = String.format("SELECT * FROM %s WHERE 1 = id ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") == 1"; + + List resultLeft = sql(sqlLiteralLeft); + assertThat(resultLeft).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownNoEqual() { + String sqlNE = String.format("SELECT * FROM %s WHERE id <> 1 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") != 1"; + + List resultNE = sql(sqlNE); + assertThat(resultNE).hasSize(2); + + List expectedNE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedNE, resultNE); + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownNoEqualNull() { + String sqlNotEqualNull = String.format("SELECT * FROM %s WHERE data <> NULL ", TABLE_NAME); + + List resultNE = sql(sqlNotEqualNull); + assertThat(resultNE).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + @TestTemplate + public void testFilterPushDownAnd() { + String sqlAnd = + String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME); + + List resultAnd = sql(sqlAnd); + assertThat(resultAnd).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + assertThat(scanEventCount).isEqualTo(1); + String expected = "(ref(name=\"id\") == 1 and ref(name=\"data\") == \"iceberg\")"; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expected); + } + + @TestTemplate + public void testFilterPushDownOr() { + String sqlOr = String.format("SELECT * FROM %s WHERE id = 1 OR data = 'b' ", TABLE_NAME); + String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"data\") == \"b\")"; + + List resultOr = sql(sqlOr); + assertThat(resultOr).hasSize(2); + + List expectedOR = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedOR, resultOr); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownGreaterThan() { + String sqlGT = String.format("SELECT * FROM %s WHERE id > 1 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") > 1"; + + List resultGT = sql(sqlGT); + assertThat(resultGT).hasSize(2); + + List expectedGT = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedGT, resultGT); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownGreaterThanNull() { + String sqlGT = String.format("SELECT * FROM %s WHERE data > null ", TABLE_NAME); + + List resultGT = sql(sqlGT); + assertThat(resultGT).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + 
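The expected filter strings asserted throughout this class are simply the toString() form of the Iceberg Expression that reaches the scan (the tests compare lastScanEvent.filter(), via asString(), against them). A small throwaway sketch for printing a few of these forms when writing new cases, assuming only the Expressions factory from iceberg-api (the class name ExpectedFilterStrings is made up), could be:

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;

public class ExpectedFilterStrings {
  public static void main(String[] args) {
    // Expected to print a form like: ref(name="id") > 1
    print(Expressions.greaterThan("id", 1));
    // A literal-on-the-left SQL predicate such as `3 > id` arrives mirrored, as the next test asserts: ref(name="id") < 3
    print(Expressions.lessThan("id", 3));
    // Conjunction, matching the expected string in testFilterPushDownAnd
    print(Expressions.and(Expressions.equal("id", 1), Expressions.equal("data", "iceberg")));
  }

  private static void print(Expression expr) {
    System.out.println(expr);
  }
}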
@TestTemplate + public void testFilterPushDownGreaterThanLiteralOnLeft() { + String sqlGT = String.format("SELECT * FROM %s WHERE 3 > id ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") < 3"; + + List resultGT = sql(sqlGT); + assertThat(resultGT).hasSize(2); + + List expectedGT = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedGT, resultGT); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownGreaterThanEqual() { + String sqlGTE = String.format("SELECT * FROM %s WHERE id >= 2 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") >= 2"; + + List resultGTE = sql(sqlGTE); + assertThat(resultGTE).hasSize(2); + + List expectedGTE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedGTE, resultGTE); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownGreaterThanEqualNull() { + String sqlGTE = String.format("SELECT * FROM %s WHERE data >= null ", TABLE_NAME); + + List resultGT = sql(sqlGTE); + assertThat(resultGT).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + @TestTemplate + public void testFilterPushDownGreaterThanEqualLiteralOnLeft() { + String sqlGTE = String.format("SELECT * FROM %s WHERE 2 >= id ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") <= 2"; + + List resultGTE = sql(sqlGTE); + assertThat(resultGTE).hasSize(2); + + List expectedGTE = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedGTE, resultGTE); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownLessThan() { + String sqlLT = String.format("SELECT * FROM %s WHERE id < 2 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") < 2"; + + List resultLT = sql(sqlLT); + assertThat(resultLT).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownLessThanNull() { + String sqlLT = String.format("SELECT * FROM %s WHERE data < null ", TABLE_NAME); + + List resultGT = sql(sqlLT); + assertThat(resultGT).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + @TestTemplate + public void testFilterPushDownLessThanLiteralOnLeft() { + String sqlLT = String.format("SELECT * FROM %s WHERE 2 < id ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") > 2"; + + List resultLT = sql(sqlLT); + assertThat(resultLT).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownLessThanEqual() { + String sqlLTE = String.format("SELECT * FROM %s WHERE id <= 1 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") <= 1"; + + List resultLTE = 
sql(sqlLTE); + assertThat(resultLTE).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownLessThanEqualNull() { + String sqlLTE = String.format("SELECT * FROM %s WHERE data <= null ", TABLE_NAME); + + List resultGT = sql(sqlLTE); + assertThat(resultGT).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + @TestTemplate + public void testFilterPushDownLessThanEqualLiteralOnLeft() { + String sqlLTE = String.format("SELECT * FROM %s WHERE 3 <= id ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") >= 3"; + + List resultLTE = sql(sqlLTE); + assertThat(resultLTE).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownIn() { + String sqlIN = String.format("SELECT * FROM %s WHERE id IN (1,2) ", TABLE_NAME); + String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"id\") == 2)"; + List resultIN = sql(sqlIN); + assertThat(resultIN).hasSize(2); + + List expectedIN = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedIN, resultIN); + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownInNull() { + String sqlInNull = + String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME); + + List result = sql(sqlInNull); + assertThat(result).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + + // In SQL, null check can only be done as IS NULL or IS NOT NULL, so it's correct to ignore it + // and push the rest down. 
+ String expectedScan = "ref(name=\"data\") == \"iceberg\""; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedScan); + } + + @TestTemplate + public void testFilterPushDownNotIn() { + String sqlNotIn = String.format("SELECT * FROM %s WHERE id NOT IN (3,2) ", TABLE_NAME); + + List resultNotIn = sql(sqlNotIn); + assertThat(resultNotIn).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + assertThat(scanEventCount).isEqualTo(1); + String expectedScan = "(ref(name=\"id\") != 2 and ref(name=\"id\") != 3)"; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedScan); + } + + @TestTemplate + public void testFilterPushDownNotInNull() { + String sqlNotInNull = String.format("SELECT * FROM %s WHERE id NOT IN (1,2,NULL) ", TABLE_NAME); + List resultGT = sql(sqlNotInNull); + assertThat(resultGT).isEmpty(); + assertThat(lastScanEvent) + .as( + "As the predicate pushdown filter out all rows, Flink did not create scan plan, so it doesn't publish any ScanEvent.") + .isNull(); + } + + @TestTemplate + public void testFilterPushDownIsNotNull() { + String sqlNotNull = String.format("SELECT * FROM %s WHERE data IS NOT NULL", TABLE_NAME); + String expectedFilter = "not_null(ref(name=\"data\"))"; + + List resultNotNull = sql(sqlNotNull); + assertThat(resultNotNull).hasSize(2); + + List expected = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expected, resultNotNull); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownIsNull() { + String sqlNull = String.format("SELECT * FROM %s WHERE data IS NULL", TABLE_NAME); + String expectedFilter = "is_null(ref(name=\"data\"))"; + + List resultNull = sql(sqlNull); + assertThat(resultNull).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownNot() { + String sqlNot = String.format("SELECT * FROM %s WHERE NOT (id = 1 OR id = 2 ) ", TABLE_NAME); + + List resultNot = sql(sqlNot); + assertThat(resultNot).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); + + assertThat(scanEventCount).isEqualTo(1); + String expectedFilter = "(ref(name=\"id\") != 1 and ref(name=\"id\") != 2)"; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownBetween() { + String sqlBetween = String.format("SELECT * FROM %s WHERE id BETWEEN 1 AND 2 ", TABLE_NAME); + + List resultBetween = sql(sqlBetween); + assertThat(resultBetween).hasSize(2); + + List expectedBetween = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedBetween, resultBetween); + + assertThat(scanEventCount).isEqualTo(1); + String expected = "(ref(name=\"id\") >= 1 and ref(name=\"id\") <= 2)"; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expected); + } + + @TestTemplate + public void testFilterPushDownNotBetween() { + String sqlNotBetween = + String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME); + String 
expectedFilter = "(ref(name=\"id\") < 2 or ref(name=\"id\") > 3)"; + + List resultNotBetween = sql(sqlNotBetween); + assertThat(resultNotBetween).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownLike() { + String expectedFilter = "ref(name=\"data\") startsWith \"\"ice\"\""; + + String sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'ice%%' "; + List resultLike = sql(sqlLike); + assertThat(resultLike).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + + // %% won't match the row with null value + sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%' "; + resultLike = sql(sqlLike); + assertThat(resultLike).hasSize(2); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedRecords, resultLike); + String expectedScan = "not_null(ref(name=\"data\"))"; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedScan); + } + + @TestTemplate + public void testFilterNotPushDownLike() { + Row expectRecord = Row.of(1, "iceberg", 10.0); + String sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i' "; + List resultLike = sql(sqlNoPushDown); + assertThat(resultLike).isEmpty(); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + + sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i%%' "; + resultLike = sql(sqlNoPushDown); + assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + + sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%ice%%g' "; + resultLike = sql(sqlNoPushDown); + assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + + sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'iceber_' "; + resultLike = sql(sqlNoPushDown); + assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + + sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'i%%g' "; + resultLike = sql(sqlNoPushDown); + assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + } + + @TestTemplate + public void testFilterPushDown2Literal() { + String sql2Literal = String.format("SELECT * FROM %s WHERE 1 > 0 ", TABLE_NAME); + List result = sql(sql2Literal); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedRecords, result); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + } + + @TestTemplate + public void testSqlParseNaN() { + // todo add some test case to test NaN + } 
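To connect the SQL-level LIKE tests above with the underlying scan API, here is a minimal sketch (illustrative only, not part of this patch) of applying the equivalent predicate to a plain table scan. It assumes an already-loaded Table handle and the same string column `data`; per the tests above, only a prefix-only pattern such as LIKE 'ice%' is expected to push down, as a startsWith predicate.

    // Illustrative sketch, not part of this patch: applying the startsWith predicate that a
    // prefix-only LIKE pattern maps to. Assumes an already-loaded Table with a string column "data".
    import org.apache.iceberg.Table;
    import org.apache.iceberg.TableScan;
    import org.apache.iceberg.expressions.Expressions;

    class LikePushDownScanSketch {
      static TableScan prefixScan(Table table) {
        // LIKE 'ice%' corresponds to startsWith in the tests above; other LIKE shapes
        // ('%i', '%i%', '%ice%g', 'iceber_', 'i%g') are not pushed down.
        return table.newScan()
            .caseSensitive(true)
            .filter(Expressions.startsWith("data", "ice"));
      }
    }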
+} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java new file mode 100644 index 000000000000..8c1e53e15f15 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.flink.SimpleDataUtil.SCHEMA; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.data.RowDataToRowMapper; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.TestTemplate; + +public class TestIcebergSourceBounded extends TestFlinkScan { + @TestTemplate + public void testValidation() { + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); + + assertThatThrownBy( + () -> + IcebergSource.forRowData() + .tableLoader(tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + .streaming(false) + .endTag("tag") + .endSnapshotId(1L) + .build()) + .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set.") + .isInstanceOf(IllegalArgumentException.class); + } + + @Override + protected List runWithProjection(String... 
projected) throws Exception { + // Convert Iceberg schema to Flink schema + Schema icebergTableSchema = + CATALOG_EXTENSION.catalog().loadTable(TestFixtures.TABLE_IDENTIFIER).schema(); + ResolvedSchema fullFlinkSchema = FlinkSchemaUtil.toResolvedSchema(icebergTableSchema); + + // Projection + List projectedColumns = + Arrays.stream(projected) + .map(fullFlinkSchema::getColumn) + .flatMap(Optional::stream) + .collect(Collectors.toList()); + + // Convert back to Iceberg schema + ResolvedSchema projectedFlinkSchema = ResolvedSchema.of(projectedColumns); + Schema projectedIcebergSchema = + FlinkSchemaUtil.convert(icebergTableSchema, projectedFlinkSchema); + return run(projectedIcebergSchema, Lists.newArrayList(), Maps.newHashMap(), "", projected); + } + + @Override + protected List runWithFilter(Expression filter, String sqlFilter, boolean caseSensitive) + throws Exception { + Map options = Maps.newHashMap(); + options.put("case-sensitive", Boolean.toString(caseSensitive)); + return run(null, Collections.singletonList(filter), options, sqlFilter, "*"); + } + + @Override + protected List runWithOptions(Map options) throws Exception { + return run(null, Lists.newArrayList(), options, "", "*"); + } + + @Override + protected List run() throws Exception { + return run(null, Lists.newArrayList(), Maps.newHashMap(), "", "*"); + } + + protected List run( + Schema projectedSchema, + List filters, + Map options, + String sqlFilter, + String... sqlSelectedFields) + throws Exception { + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + Configuration config = new Configuration(); + config.set(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); + Table table; + try (TableLoader tableLoader = tableLoader()) { + tableLoader.open(); + table = tableLoader.loadTable(); + } + + IcebergSource.Builder sourceBuilder = + IcebergSource.forRowData() + .tableLoader(tableLoader()) + .table(table) + .assignerFactory(new SimpleSplitAssignerFactory()) + .flinkConfig(config); + if (projectedSchema != null) { + sourceBuilder.project(projectedSchema); + } + + sourceBuilder.filters(filters); + sourceBuilder.setAll(options); + + DataStream stream = + sourceBuilder + .buildStream(env) + .map( + new RowDataToRowMapper( + FlinkSchemaUtil.convert( + projectedSchema == null ? table.schema() : projectedSchema))); + + try (CloseableIterator iter = stream.executeAndCollect()) { + return Lists.newArrayList(iter); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java new file mode 100644 index 000000000000..19804dec8088 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.flink.source.reader.ReaderFunction; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public abstract class TestIcebergSourceBoundedConverterBase { + @TempDir protected Path temporaryFolder; + + @RegisterExtension + static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + @Parameters(name = "format={0}, parallelism = {1}, useConverter = {2}") + public static Object[][] parameters() { + return new Object[][] { + {FileFormat.AVRO, 2, true}, + {FileFormat.PARQUET, 2, true}, + {FileFormat.ORC, 2, true} + }; + } + + @Parameter(index = 0) + FileFormat fileFormat; + + @Parameter(index = 1) + int parallelism; + + @Parameter(index = 2) + boolean useConverter; + + @TestTemplate + public void testUnpartitionedTable() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + new GenericAppenderHelper(table, fileFormat, temporaryFolder).appendToTable(expectedRecords); + TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testPartitionedTable() throws Exception { + String dateStr = "2020-03-20"; + Table table = getPartitionedTable(); + List 
expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + for (Record expectedRecord : expectedRecords) { + expectedRecord.setField("dt", dateStr); + } + addRecordsToPartitionedTable(table, dateStr, expectedRecords); + TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testProjection() throws Exception { + Table table = getPartitionedTable(); + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + addRecordsToPartitionedTable(table, "2020-03-20", expectedRecords); + // select the "data" field (fieldId == 1) + Schema projectedSchema = TypeUtil.select(TestFixtures.SCHEMA, Sets.newHashSet(1)); + List expectedRows = + Arrays.asList(Row.of(expectedRecords.get(0).get(0)), Row.of(expectedRecords.get(1).get(0))); + TestHelpers.assertRows( + run(projectedSchema, Collections.emptyList(), Collections.emptyMap()), expectedRows); + } + + static Table getPartitionedTable() { + return CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + } + + static TableLoader tableLoader() { + return CATALOG_EXTENSION.tableLoader(); + } + + private void addRecordsToPartitionedTable( + Table table, String dateStr, List expectedRecords) throws IOException { + new GenericAppenderHelper(table, fileFormat, temporaryFolder) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of(dateStr, 0), expectedRecords); + } + + private List run() throws Exception { + return run(null, Collections.emptyList(), Collections.emptyMap()); + } + + private List run( + Schema projectedSchema, List filters, Map options) + throws Exception { + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(parallelism); + env.getConfig().enableObjectReuse(); + + Configuration config = new Configuration(); + config.set(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); + Table table; + try (TableLoader tableLoader = tableLoader()) { + tableLoader.open(); + table = tableLoader.loadTable(); + } + + Schema readSchema = projectedSchema != null ? 
projectedSchema : table.schema(); + IcebergSource.Builder sourceBuilder = + getSourceBuilder(projectedSchema, filters, readSchema, config, table); + + if (projectedSchema != null) { + sourceBuilder.project(projectedSchema); + } + + sourceBuilder.filters(filters); + sourceBuilder.setAll(options); + + DataStream inputStream = + env.fromSource( + sourceBuilder.build(), + WatermarkStrategy.noWatermarks(), + "testBasicRead", + getTypeInfo(readSchema)); + + DataStream stream = mapToRow(inputStream, readSchema); + + try (CloseableIterator iter = stream.executeAndCollect()) { + return Lists.newArrayList(iter); + } + } + + private IcebergSource.Builder getSourceBuilder( + Schema projectedSchema, + List filters, + Schema readSchema, + Configuration config, + Table table) + throws Exception { + if (useConverter) { + return createSourceBuilderWithConverter(readSchema, config, table); + } + return createSourceBuilderWithReaderFunction(table, projectedSchema, filters, config); + } + + private IcebergSource.Builder createSourceBuilderWithConverter( + Schema readSchema, Configuration config, Table table) throws Exception { + return IcebergSource.forOutputType(getConverter(readSchema, table)) + .tableLoader(tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + .flinkConfig(config); + } + + private IcebergSource.Builder createSourceBuilderWithReaderFunction( + Table table, Schema projected, List filters, Configuration config) + throws Exception { + return IcebergSource.builder() + .tableLoader(tableLoader()) + .readerFunction(getReaderFunction(projected, table, filters)) + .assignerFactory(new SimpleSplitAssignerFactory()) + .flinkConfig(config); + } + + protected abstract org.apache.iceberg.flink.source.reader.RowDataConverter getConverter( + org.apache.iceberg.Schema icebergSchema, Table table) throws Exception; + + protected ReaderFunction getReaderFunction( + org.apache.iceberg.Schema icebergSchema, Table table, List filters) + throws Exception { + throw new UnsupportedOperationException("No default implementation for getReaderFunction"); + } + + protected abstract TypeInformation getTypeInfo(Schema icebergSchema); + + protected abstract DataStream mapToRow(DataStream inputStream, Schema icebergSchema); +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java new file mode 100644 index 000000000000..faddce542285 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.List; +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.data.RowDataToRowMapper; +import org.apache.iceberg.flink.sink.AvroGenericRecordToRowDataMapper; +import org.apache.iceberg.flink.source.reader.AvroGenericRecordConverter; +import org.apache.iceberg.flink.source.reader.AvroGenericRecordReaderFunction; +import org.apache.iceberg.flink.source.reader.ReaderFunction; +import org.apache.iceberg.flink.source.reader.RowDataConverter; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergSourceBoundedGenericRecord + extends TestIcebergSourceBoundedConverterBase { + + @Parameters(name = "format={0}, parallelism = {1}, useConverter = {2}") + public static Object[][] parameters() { + return new Object[][] { + {FileFormat.AVRO, 2, true}, + {FileFormat.PARQUET, 2, true}, + {FileFormat.PARQUET, 2, false}, + {FileFormat.ORC, 2, true} + }; + } + + @Override + protected RowDataConverter getConverter(Schema icebergSchema, Table table) { + return AvroGenericRecordConverter.fromIcebergSchema(icebergSchema, table.name()); + } + + @Override + protected ReaderFunction getReaderFunction( + Schema icebergSchema, Table table, List filters) throws Exception { + return new AvroGenericRecordReaderFunction( + TestFixtures.TABLE_IDENTIFIER.name(), + new Configuration(), + table.schema(), + icebergSchema, + null, + false, + table.io(), + table.encryption(), + filters); + } + + @Override + protected TypeInformation getTypeInfo(Schema icebergSchema) { + org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, TestFixtures.TABLE_IDENTIFIER.name()); + return new GenericRecordAvroTypeInfo(avroSchema); + } + + @Override + protected DataStream mapToRow(DataStream inputStream, Schema icebergSchema) { + RowType rowType = FlinkSchemaUtil.convert(icebergSchema); + org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, TestFixtures.TABLE_IDENTIFIER.name()); + return inputStream + .map(AvroGenericRecordToRowDataMapper.forAvroSchema(avroSchema)) + .map(new RowDataToRowMapper(rowType)); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java new file mode 100644 index 000000000000..13087bc0a06a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; +import org.apache.flink.types.Row; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.source.reader.RowConverter; +import org.apache.iceberg.flink.source.reader.RowDataConverter; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergSourceBoundedRow extends TestIcebergSourceBoundedConverterBase { + + @Override + protected RowDataConverter getConverter(Schema icebergSchema, Table table) { + return RowConverter.fromIcebergSchema(icebergSchema); + } + + @Override + protected TypeInformation getTypeInfo(Schema icebergSchema) { + ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(icebergSchema); + TypeInformation[] types = + resolvedSchema.getColumnDataTypes().stream() + .map(ExternalTypeInfo::of) + .toArray(TypeInformation[]::new); + String[] fieldNames = resolvedSchema.getColumnNames().toArray(String[]::new); + return new RowTypeInfo(types, fieldNames); + } + + @Override + protected DataStream mapToRow(DataStream inputStream, Schema icebergSchema) { + return inputStream; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java new file mode 100644 index 000000000000..d3713e296014 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.config.TableConfigOptions; +import org.apache.flink.types.Row; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.junit.jupiter.api.BeforeEach; + +public class TestIcebergSourceBoundedSql extends TestIcebergSourceBounded { + private volatile TableEnvironment tEnv; + + @BeforeEach + public void before() throws IOException { + Configuration tableConf = getTableEnv().getConfig().getConfiguration(); + tableConf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true); + SqlHelpers.sql( + getTableEnv(), + "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_EXTENSION.warehouse()); + SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); + getTableEnv() + .getConfig() + .getConfiguration() + .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + } + + private TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + this.tEnv = + TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); + } + } + } + return tEnv; + } + + @Override + protected List run( + Schema projectedSchema, + List filters, + Map options, + String sqlFilter, + String... sqlSelectedFields) + throws Exception { + String select = String.join(",", sqlSelectedFields); + String optionStr = SqlHelpers.sqlOptionsToString(options); + return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java new file mode 100644 index 000000000000..749cbf89338a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java @@ -0,0 +1,573 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.nio.file.Path; +import java.time.Duration; +import java.util.Collection; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.client.program.ClusterClient; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.runtime.client.JobStatusMessage; +import org.apache.flink.runtime.testutils.InMemoryReporter; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.test.junit5.InjectClusterClient; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.HadoopTableExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.data.RowDataToRowMapper; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +public class TestIcebergSourceContinuous { + + public static final InMemoryReporter METRIC_REPORTER = InMemoryReporter.create(); + + @TempDir protected Path temporaryFolder; + + @RegisterExtension + public static MiniClusterExtension miniClusterExtension = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(METRIC_REPORTER); + + @RegisterExtension + private static final HadoopTableExtension TABLE_EXTENSION = + new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + + private final AtomicLong randomSeed = new AtomicLong(0L); + + @Test + public void testTableScanThenIncremental() throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List result1 = waitForResult(iter, 2); + TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, 
randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + List result2 = waitForResult(iter, 2); + TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testTableScanThenIncrementalAfterExpiration() throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + long snapshotId = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + + TABLE_EXTENSION.table().expireSnapshots().expireSnapshotId(snapshotId).commit(); + + assertThat(TABLE_EXTENSION.table().history()).hasSize(1); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + + assertThat(FlinkSplitPlanner.checkScanMode(scanContext)) + .isEqualTo(FlinkSplitPlanner.ScanMode.BATCH); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List result1 = waitForResult(iter, 4); + List initialRecords = Lists.newArrayList(); + initialRecords.addAll(batch1); + initialRecords.addAll(batch2); + TestHelpers.assertRecords(result1, initialRecords, TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testEarliestSnapshot() throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot0 + List batch0 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch0); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List result1 = waitForResult(iter, 4); + List combinedBatch0AndBatch1 = Lists.newArrayList(batch0); + combinedBatch0AndBatch1.addAll(batch1); + TestHelpers.assertRecords(result1, 
combinedBatch0AndBatch1, TABLE_EXTENSION.table().schema()); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + + List result2 = waitForResult(iter, 2); + TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testLatestSnapshot(@InjectClusterClient ClusterClient clusterClient) + throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot0 + List batch0 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch0); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .build(); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + // we want to make sure job is running first so that enumerator can + // start from the latest snapshot before inserting the next batch2 below. + waitUntilJobIsRunning(clusterClient); + + // inclusive behavior for starting snapshot + List result1 = waitForResult(iter, 2); + TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + + List result2 = waitForResult(iter, 2); + TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testSpecificSnapshotId() throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot0 + List batch0 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch0); + long snapshot0 = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + long snapshot1 = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot1) + .build(); + + try 
(CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List result1 = waitForResult(iter, 2); + TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + + List result2 = waitForResult(iter, 2); + TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testSpecificSnapshotTimestamp() throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot0 + List batch0 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch0); + long snapshot0Timestamp = TABLE_EXTENSION.table().currentSnapshot().timestampMillis(); + + // sleep for 2 ms to make sure snapshot1 has a higher timestamp value + Thread.sleep(2); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + long snapshot1Timestamp = TABLE_EXTENSION.table().currentSnapshot().timestampMillis(); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot1Timestamp) + .build(); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + // consume data from snapshot1 + List result1 = waitForResult(iter, 2); + TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + + List result2 = waitForResult(iter, 2); + TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testReadingFromBranch() throws Exception { + String branch = "b1"; + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + List batchBase = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batchBase); + + // create branch + TABLE_EXTENSION + .table() + .manageSnapshots() + .createBranch(branch, TABLE_EXTENSION.table().currentSnapshot().snapshotId()) + .commit(); + + // snapshot1 to branch + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + 
dataAppender.appendToTable(branch, batch1); + + // snapshot2 to branch + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(branch, batch2); + + List branchExpectedRecords = Lists.newArrayList(); + branchExpectedRecords.addAll(batchBase); + branchExpectedRecords.addAll(batch1); + branchExpectedRecords.addAll(batch2); + // reads from branch: it should contain the first snapshot (before the branch creation) followed + // by the next 2 snapshots added + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .useBranch(branch) + .build(); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List resultMain = waitForResult(iter, 6); + TestHelpers.assertRecords( + resultMain, branchExpectedRecords, TABLE_EXTENSION.table().schema()); + + // snapshot3 to branch + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(branch, batch3); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + // snapshot4 to branch + List batch4 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(branch, batch4); + + List result4 = waitForResult(iter, 2); + TestHelpers.assertRecords(result4, batch4, TABLE_EXTENSION.table().schema()); + } + + // read only from main branch. Should contain only the first snapshot + scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List resultMain = waitForResult(iter, 2); + TestHelpers.assertRecords(resultMain, batchBase, TABLE_EXTENSION.table().schema()); + + List batchMain2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batchMain2); + resultMain = waitForResult(iter, 2); + TestHelpers.assertRecords(resultMain, batchMain2, TABLE_EXTENSION.table().schema()); + } + } + + @Test + public void testValidation() { + assertThatThrownBy( + () -> + IcebergSource.forRowData() + .tableLoader(TABLE_EXTENSION.tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + .streaming(true) + .endTag("tag") + .build()) + .hasMessage("Cannot set end-tag option for streaming reader") + .isInstanceOf(IllegalArgumentException.class); + } + + private DataStream createStream(ScanContext scanContext) throws Exception { + // start the source and collect output + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + DataStream stream = + env.fromSource( + IcebergSource.forRowData() + .tableLoader(TABLE_EXTENSION.tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + .streaming(scanContext.isStreaming()) + .streamingStartingStrategy(scanContext.streamingStartingStrategy()) + .startSnapshotTimestamp(scanContext.startSnapshotTimestamp()) + .startSnapshotId(scanContext.startSnapshotId()) + .monitorInterval(Duration.ofMillis(10L)) + .branch(scanContext.branch()) + .build(), + 
WatermarkStrategy.noWatermarks(), + "icebergSource", + TypeInformation.of(RowData.class)) + .map(new RowDataToRowMapper(FlinkSchemaUtil.convert(TABLE_EXTENSION.table().schema()))); + return stream; + } + + public static List waitForResult(CloseableIterator iter, int limit) { + List results = Lists.newArrayListWithCapacity(limit); + while (results.size() < limit) { + if (iter.hasNext()) { + results.add(iter.next()); + } else { + break; + } + } + return results; + } + + public static void waitUntilJobIsRunning(ClusterClient client) { + Awaitility.await("job should be running") + .atMost(Duration.ofSeconds(30)) + .pollInterval(Duration.ofMillis(10)) + .untilAsserted(() -> assertThat(getRunningJobs(client)).isNotEmpty()); + } + + public static List getRunningJobs(ClusterClient client) throws Exception { + Collection statusMessages = client.listJobs().get(); + return statusMessages.stream() + .filter(status -> status.getJobState() == JobStatus.RUNNING) + .map(JobStatusMessage::getJobId) + .collect(Collectors.toList()); + } + + private static void assertThatIcebergEnumeratorMetricsExist() { + assertThatIcebergSourceMetricExists( + "enumerator", "coordinator.enumerator.elapsedSecondsSinceLastSplitDiscovery"); + assertThatIcebergSourceMetricExists("enumerator", "coordinator.enumerator.unassignedSplits"); + assertThatIcebergSourceMetricExists("enumerator", "coordinator.enumerator.pendingRecords"); + } + + private static void assertThatIcebergSourceMetricExists( + String metricGroupPattern, String metricName) { + Optional groups = METRIC_REPORTER.findGroup(metricGroupPattern); + assertThat(groups).isPresent(); + assertThat( + METRIC_REPORTER.getMetricsByGroup(groups.get()).keySet().stream() + .map(name -> groups.get().getMetricIdentifier(name))) + .satisfiesOnlyOnce( + fullMetricName -> assertThat(fullMetricName).containsSubsequence(metricName)); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java new file mode 100644 index 000000000000..310c8be2eff6 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java @@ -0,0 +1,397 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.flink.SimpleDataUtil.tableRecords; +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.time.Duration; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.client.program.ClusterClient; +import org.apache.flink.configuration.CheckpointingOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.RestartStrategyOptions; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.core.execution.SavepointFormatType; +import org.apache.flink.runtime.highavailability.nonha.embedded.HaLeadershipControl; +import org.apache.flink.runtime.minicluster.MiniCluster; +import org.apache.flink.runtime.minicluster.RpcServiceSharing; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.test.junit5.InjectClusterClient; +import org.apache.flink.test.junit5.InjectMiniCluster; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkReadOptions; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.sink.FlinkSink; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +@Timeout(value = 120) +public class TestIcebergSourceFailover { + + // Parallelism higher than 1, but lower than the number of splits used by some of our tests + // The goal is to allow some splits to remain in the enumerator when restoring the state + private static final int PARALLELISM = 2; + private static final int DO_NOT_FAIL = Integer.MAX_VALUE; + protected static final MiniClusterResourceConfiguration MINI_CLUSTER_RESOURCE_CONFIG = + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(1) + .setNumberSlotsPerTaskManager(PARALLELISM) + .setRpcServiceSharing(RpcServiceSharing.DEDICATED) + .withHaLeadershipControl() + .build(); + + @RegisterExtension + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension(MINI_CLUSTER_RESOURCE_CONFIG); + + @TempDir protected Path temporaryFolder; + + @RegisterExtension + protected static final HadoopCatalogExtension 
SOURCE_CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + @RegisterExtension + protected static final HadoopCatalogExtension SINK_CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.SINK_TABLE); + + protected Table sourceTable; + protected Table sinkTable; + + @BeforeEach + protected void setupTable() { + this.sourceTable = + SOURCE_CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + this.sinkTable = + SINK_CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.SINK_TABLE_IDENTIFIER, TestFixtures.SCHEMA); + } + + @BeforeEach + protected void startMiniCluster(@InjectMiniCluster MiniCluster miniCluster) throws Exception { + if (!miniCluster.isRunning()) { + miniCluster.start(); + } + } + + @AfterEach + protected void stopMiniCluster(@InjectMiniCluster MiniCluster miniCluster) throws Exception { + miniCluster.close(); + } + + protected IcebergSource.Builder sourceBuilder() { + Configuration config = new Configuration(); + return IcebergSource.forRowData() + .tableLoader(SOURCE_CATALOG_EXTENSION.tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + // Prevent combining splits + .set( + FlinkReadOptions.SPLIT_FILE_OPEN_COST, + Long.toString(TableProperties.SPLIT_SIZE_DEFAULT)) + .flinkConfig(config); + } + + protected Schema schema() { + return TestFixtures.SCHEMA; + } + + protected List generateRecords(int numRecords, long seed) { + return RandomGenericData.generate(schema(), numRecords, seed); + } + + protected void assertRecords(Table table, List expectedRecords, Duration timeout) + throws Exception { + SimpleDataUtil.assertTableRecords(table, expectedRecords, timeout); + } + + @Disabled("Disabled for now as it is flaky on CI") + @Test + public void testBoundedWithSavepoint(@InjectClusterClient ClusterClient clusterClient) + throws Exception { + List expectedRecords = Lists.newArrayList(); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); + for (int i = 0; i < 4; ++i) { + List records = generateRecords(2, i); + expectedRecords.addAll(records); + dataAppender.appendToTable(records); + } + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + createBoundedStreams(env, 2); + + JobClient jobClient = env.executeAsync("Bounded Iceberg Source Savepoint Test"); + JobID jobId = jobClient.getJobID(); + + // Write something, but do not finish before checkpoint is created + RecordCounterToWait.waitForCondition(); + CompletableFuture savepoint = + clusterClient.stopWithSavepoint( + jobId, false, temporaryFolder.toString(), SavepointFormatType.CANONICAL); + RecordCounterToWait.continueProcessing(); + + // Wait for the job to stop with the savepoint + String savepointPath = savepoint.get(); + + // We expect that at least a few records has written + assertThat(tableRecords(sinkTable)).hasSizeGreaterThan(0); + + // New env from the savepoint + Configuration conf = new Configuration(); + conf.set(CheckpointingOptions.SAVEPOINT_DIRECTORY, savepointPath); + env = StreamExecutionEnvironment.getExecutionEnvironment(conf); + createBoundedStreams(env, DO_NOT_FAIL); + + env.execute("Bounded Iceberg Source Savepoint Test"); + + // We expect no duplications + assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); + } + + @Test + public void testBoundedWithTaskManagerFailover(@InjectMiniCluster MiniCluster miniCluster) + throws Exception { + 
testBoundedIcebergSource(FailoverType.TM, miniCluster); + } + + @Test + public void testBoundedWithJobManagerFailover(@InjectMiniCluster MiniCluster miniCluster) + throws Exception { + testBoundedIcebergSource(FailoverType.JM, miniCluster); + } + + private void testBoundedIcebergSource(FailoverType failoverType, MiniCluster miniCluster) + throws Exception { + List expectedRecords = Lists.newArrayList(); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); + for (int i = 0; i < 4; ++i) { + List records = generateRecords(2, i); + expectedRecords.addAll(records); + dataAppender.appendToTable(records); + } + + Configuration config = new Configuration(); + config.set( + RestartStrategyOptions.RESTART_STRATEGY, + RestartStrategyOptions.RestartStrategyType.FIXED_DELAY.getMainValue()); + config.set(RestartStrategyOptions.RESTART_STRATEGY_FIXED_DELAY_ATTEMPTS, 1); + config.set(RestartStrategyOptions.RESTART_STRATEGY_FIXED_DELAY_DELAY, Duration.ofSeconds(0)); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(config); + createBoundedStreams(env, 2); + + JobClient jobClient = env.executeAsync("Bounded Iceberg Source Failover Test"); + JobID jobId = jobClient.getJobID(); + + RecordCounterToWait.waitForCondition(); + triggerFailover(failoverType, jobId, RecordCounterToWait::continueProcessing, miniCluster); + + assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); + } + + @Test + public void testContinuousWithTaskManagerFailover(@InjectMiniCluster MiniCluster miniCluster) + throws Exception { + testContinuousIcebergSource(FailoverType.TM, miniCluster); + } + + @Test + public void testContinuousWithJobManagerFailover(@InjectMiniCluster MiniCluster miniCluster) + throws Exception { + testContinuousIcebergSource(FailoverType.JM, miniCluster); + } + + private void testContinuousIcebergSource(FailoverType failoverType, MiniCluster miniCluster) + throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); + List expectedRecords = Lists.newArrayList(); + + List batch = generateRecords(2, 0); + expectedRecords.addAll(batch); + dataAppender.appendToTable(batch); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(PARALLELISM); + env.enableCheckpointing(10L); + Configuration config = new Configuration(); + config.set(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); + + DataStream stream = + env.fromSource( + sourceBuilder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10)) + .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(), + WatermarkStrategy.noWatermarks(), + "IcebergSource", + TypeInformation.of(RowData.class)); + + // CollectStreamSink from DataStream#executeAndCollect() doesn't guarantee + // exactly-once behavior. When Iceberg sink, we can verify end-to-end + // exactly-once. Here we mainly about source exactly-once behavior. 
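+ // The Iceberg FlinkSink commits data files only when a checkpoint completes, so the sink table
+ // holds committed results that can be compared against the expected records after the failover.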
+ FlinkSink.forRowData(stream) + .table(sinkTable) + .tableLoader(SINK_CATALOG_EXTENSION.tableLoader()) + .append(); + + JobClient jobClient = env.executeAsync("Continuous Iceberg Source Failover Test"); + JobID jobId = jobClient.getJobID(); + + for (int i = 1; i < 5; i++) { + Thread.sleep(10); + List records = generateRecords(2, i); + expectedRecords.addAll(records); + dataAppender.appendToTable(records); + if (i == 2) { + triggerFailover(failoverType, jobId, () -> {}, miniCluster); + } + } + + // wait longer for continuous source to reduce flakiness + // because CI servers tend to be overloaded. + assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); + } + + private void createBoundedStreams(StreamExecutionEnvironment env, int failAfter) { + env.setParallelism(PARALLELISM); + + DataStream stream = + env.fromSource( + sourceBuilder().build(), + WatermarkStrategy.noWatermarks(), + "IcebergSource", + TypeInformation.of(RowData.class)); + + DataStream streamFailingInTheMiddleOfReading = + RecordCounterToWait.wrapWithFailureAfter(stream, failAfter); + + // CollectStreamSink from DataStream#executeAndCollect() doesn't guarantee + // exactly-once behavior. When Iceberg sink, we can verify end-to-end + // exactly-once. Here we mainly about source exactly-once behavior. + FlinkSink.forRowData(streamFailingInTheMiddleOfReading) + .table(sinkTable) + .tableLoader(SINK_CATALOG_EXTENSION.tableLoader()) + .append(); + } + + // ------------------------------------------------------------------------ + // test utilities copied from Flink's FileSourceTextLinesITCase + // ------------------------------------------------------------------------ + + private enum FailoverType { + NONE, + TM, + JM + } + + private static void triggerFailover( + FailoverType type, JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) + throws Exception { + switch (type) { + case NONE: + afterFailAction.run(); + break; + case TM: + restartTaskManager(afterFailAction, miniCluster); + break; + case JM: + triggerJobManagerFailover(jobId, afterFailAction, miniCluster); + break; + } + } + + private static void triggerJobManagerFailover( + JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) throws Exception { + HaLeadershipControl haLeadershipControl = miniCluster.getHaLeadershipControl().get(); + haLeadershipControl.revokeJobMasterLeadership(jobId).get(); + afterFailAction.run(); + haLeadershipControl.grantJobMasterLeadership(jobId).get(); + } + + private static void restartTaskManager(Runnable afterFailAction, MiniCluster miniCluster) + throws Exception { + miniCluster.terminateTaskManager(0).get(); + afterFailAction.run(); + miniCluster.startTaskManager(); + } + + private static class RecordCounterToWait { + + private static AtomicInteger records; + private static CountDownLatch countDownLatch; + private static CompletableFuture continueProcessing; + + private static DataStream wrapWithFailureAfter(DataStream stream, int condition) { + + records = new AtomicInteger(); + continueProcessing = new CompletableFuture<>(); + countDownLatch = new CountDownLatch(stream.getParallelism()); + return stream.map( + record -> { + boolean reachedFailPoint = records.incrementAndGet() > condition; + boolean notFailedYet = countDownLatch.getCount() != 0; + if (notFailedYet && reachedFailPoint) { + countDownLatch.countDown(); + continueProcessing.get(); + } + return record; + }); + } + + private static void waitForCondition() throws InterruptedException { + countDownLatch.await(); + } + + private static void 
continueProcessing() { + continueProcessing.complete(null); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java new file mode 100644 index 000000000000..4f61d2f7308a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.time.Duration; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkReadOptions; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.util.StructLikeWrapper; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.BeforeEach; + +public class TestIcebergSourceFailoverWithWatermarkExtractor extends TestIcebergSourceFailover { + // Increment ts by 15 minutes for each generateRecords batch + private static final long RECORD_BATCH_TS_INCREMENT_MILLI = TimeUnit.MINUTES.toMillis(15); + // Within a batch, increment ts by 1 second + private static final long RECORD_TS_INCREMENT_MILLI = TimeUnit.SECONDS.toMillis(1); + + private final AtomicLong tsMilli = new AtomicLong(System.currentTimeMillis()); + + @Override + @BeforeEach + protected void setupTable() { + this.sourceTable = + SOURCE_CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.TS_SCHEMA); + this.sinkTable = + SINK_CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.SINK_TABLE_IDENTIFIER, TestFixtures.TS_SCHEMA); + } + + @Override + protected IcebergSource.Builder sourceBuilder() { + Configuration config = new Configuration(); + return IcebergSource.forRowData() + .tableLoader(SOURCE_CATALOG_EXTENSION.tableLoader()) + .watermarkColumn("ts") + .project(TestFixtures.TS_SCHEMA) + // Prevent combining splits + .set( + FlinkReadOptions.SPLIT_FILE_OPEN_COST, + Long.toString(TableProperties.SPLIT_SIZE_DEFAULT)) + .flinkConfig(config); + } + + @Override + protected 
Schema schema() { + return TestFixtures.TS_SCHEMA; + } + + @Override + protected List generateRecords(int numRecords, long seed) { + // Override the ts field to create a more realistic situation for event time alignment + tsMilli.addAndGet(RECORD_BATCH_TS_INCREMENT_MILLI); + return RandomGenericData.generate(schema(), numRecords, seed).stream() + .peek( + record -> { + LocalDateTime ts = + LocalDateTime.ofInstant( + Instant.ofEpochMilli(tsMilli.addAndGet(RECORD_TS_INCREMENT_MILLI)), + ZoneId.of("Z")); + record.setField("ts", ts); + }) + .collect(Collectors.toList()); + } + + /** + * This override is needed because {@link Comparators} used by {@link StructLikeWrapper} retrieves + * Timestamp type using Long type as inner class, while the {@link RandomGenericData} generates + * {@link LocalDateTime} for {@code TimestampType.withoutZone()}. This method normalizes the + * {@link LocalDateTime} to a Long type so that Comparators can continue to work. + */ + @Override + protected void assertRecords(Table table, List expectedRecords, Duration timeout) + throws Exception { + List expectedNormalized = convertLocalDateTimeToMilli(expectedRecords); + Awaitility.await("expected list of records should be produced") + .atMost(timeout) + .untilAsserted( + () -> + SimpleDataUtil.assertRecordsEqual( + expectedNormalized, + convertLocalDateTimeToMilli(SimpleDataUtil.tableRecords(table)), + table.schema())); + } + + private List convertLocalDateTimeToMilli(List records) { + return records.stream() + .peek( + r -> { + LocalDateTime localDateTime = ((LocalDateTime) r.getField("ts")); + r.setField("ts", localDateTime.atZone(ZoneOffset.UTC).toInstant().toEpochMilli()); + }) + .collect(Collectors.toList()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java new file mode 100644 index 000000000000..2908cb927269 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.lang.reflect.Field; +import java.nio.file.Path; +import java.util.List; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.runtime.executiongraph.AccessExecutionGraph; +import org.apache.flink.runtime.executiongraph.AccessExecutionJobVertex; +import org.apache.flink.runtime.minicluster.MiniCluster; +import org.apache.flink.runtime.testutils.InternalMiniClusterExtension; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.data.RowDataToRowMapper; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +public class TestIcebergSourceInferParallelism { + private static final int NUM_TMS = 2; + private static final int SLOTS_PER_TM = 2; + private static final int PARALLELISM = NUM_TMS * SLOTS_PER_TM; + private static final int MAX_INFERRED_PARALLELISM = 3; + + @RegisterExtension + private static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(NUM_TMS) + .setNumberSlotsPerTaskManager(SLOTS_PER_TM) + .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) + .build()); + + @RegisterExtension + protected static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + @TempDir private Path tmpDir; + + private Table table; + private GenericAppenderHelper dataAppender; + + @BeforeEach + public void before() throws IOException { + this.table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + this.dataAppender = new GenericAppenderHelper(table, FileFormat.PARQUET, tmpDir); + } + + @AfterEach + public void after() { + CATALOG_EXTENSION.catalog().dropTable(TestFixtures.TABLE_IDENTIFIER); + } + + @Test + public void testEmptyTable() throws Exception { + // Inferred parallelism should be at least 1 even if table is empty + test(1, 0); + } + + @Test + public void testTableWithFilesLessThanMaxInferredParallelism() throws Exception { + // Append files to the table + for (int i = 0; i < 2; ++i) { + List batch = RandomGenericData.generate(table.schema(), 1, 0); + dataAppender.appendToTable(batch); + } + + // Inferred parallelism should equal to 2 splits + test(2, 2); + } + + @Test + 
public void testTableWithFilesMoreThanMaxInferredParallelism() throws Exception { + // Append files to the table + for (int i = 0; i < MAX_INFERRED_PARALLELISM + 1; ++i) { + List batch = RandomGenericData.generate(table.schema(), 1, 0); + dataAppender.appendToTable(batch); + } + + // Inferred parallelism should be capped by the MAX_INFERRED_PARALLELISM + test(MAX_INFERRED_PARALLELISM, MAX_INFERRED_PARALLELISM + 1); + } + + private void test(int expectedParallelism, int expectedRecords) throws Exception { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(PARALLELISM); + + Configuration config = new Configuration(); + config.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true); + config.set( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, + MAX_INFERRED_PARALLELISM); + + DataStream dataStream = + IcebergSource.forRowData() + .tableLoader(CATALOG_EXTENSION.tableLoader()) + .table(table) + .flinkConfig(config) + // force one file per split + .splitSize(1L) + .buildStream(env) + .map(new RowDataToRowMapper(FlinkSchemaUtil.convert(table.schema()))); + + DataStream.Collector collector = new DataStream.Collector<>(); + dataStream.collectAsync(collector); + JobClient jobClient = env.executeAsync(); + try (CloseableIterator iterator = collector.getOutput()) { + List result = Lists.newArrayList(); + while (iterator.hasNext()) { + result.add(iterator.next()); + } + + assertThat(result).hasSize(expectedRecords); + verifySourceParallelism( + expectedParallelism, miniCluster().getExecutionGraph(jobClient.getJobID()).get()); + } + } + + /** + * Borrowed this approach from Flink {@code FileSourceTextLinesITCase} to get source parallelism + * from execution graph. + */ + private static void verifySourceParallelism( + int expectedParallelism, AccessExecutionGraph executionGraph) { + AccessExecutionJobVertex sourceVertex = + executionGraph.getVerticesTopologically().iterator().next(); + assertThat(sourceVertex.getParallelism()).isEqualTo(expectedParallelism); + } + + /** + * Use reflection to get {@code InternalMiniClusterExtension} and {@code MiniCluster} to get + * execution graph and source parallelism. Haven't find other way via public APIS. + */ + private static MiniCluster miniCluster() throws Exception { + Field privateField = + MiniClusterExtension.class.getDeclaredField("internalMiniClusterExtension"); + privateField.setAccessible(true); + InternalMiniClusterExtension internalExtension = + (InternalMiniClusterExtension) privateField.get(MINI_CLUSTER_EXTENSION); + return internalExtension.getMiniCluster(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java new file mode 100644 index 000000000000..df148c212ebd --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.util.CloseableIterator; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.StructLikeSet; +import org.junit.jupiter.api.extension.RegisterExtension; + +public class TestIcebergSourceReaderDeletes extends TestFlinkReaderDeletesBase { + + private static final int PARALLELISM = 4; + + @RegisterExtension + private static final MiniClusterExtension MINI_CLUSTER = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @Override + protected StructLikeSet rowSet(String tableName, Table testTable, String... 
columns) + throws IOException { + Schema projected = testTable.schema().select(columns); + RowType rowType = FlinkSchemaUtil.convert(projected); + + Map properties = Maps.newHashMap(); + properties.put( + CatalogProperties.WAREHOUSE_LOCATION, + hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); + properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); + properties.put( + CatalogProperties.CLIENT_POOL_SIZE, + Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); + CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); + TableLoader hiveTableLoader = + TableLoader.fromCatalog(hiveCatalogLoader, TableIdentifier.of("default", tableName)); + hiveTableLoader.open(); + try (TableLoader tableLoader = hiveTableLoader) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + DataStream stream = + env.fromSource( + IcebergSource.builder() + .tableLoader(tableLoader) + .assignerFactory(new SimpleSplitAssignerFactory()) + .project(projected) + .build(), + WatermarkStrategy.noWatermarks(), + "testBasicRead", + TypeInformation.of(RowData.class)); + + try (CloseableIterator iter = stream.executeAndCollect()) { + List rowDataList = Lists.newArrayList(iter); + StructLikeSet set = StructLikeSet.create(projected.asStruct()); + rowDataList.forEach( + rowData -> { + RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); + set.add(wrapper.wrap(rowData)); + }); + return set; + } catch (Exception e) { + throw new IOException("Failed to collect result", e); + } + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java new file mode 100644 index 000000000000..0cdaf8371cbd --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.time.Instant; +import java.time.ZoneId; +import java.util.List; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.config.TableConfigOptions; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** Use the IcebergSource (FLIP-27) */ +public class TestIcebergSourceSql extends TestSqlBase { + private static final Schema SCHEMA_TS = + new Schema( + required(1, "t1", Types.TimestampType.withoutZone()), + required(2, "t2", Types.LongType.get())); + + @BeforeEach + @Override + public void before() throws IOException { + setUpTableEnv(getTableEnv()); + setUpTableEnv(getStreamingTableEnv()); + } + + private static void setUpTableEnv(TableEnvironment tableEnvironment) { + Configuration tableConf = tableEnvironment.getConfig().getConfiguration(); + tableConf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true); + // Disable inferring parallelism to avoid interfering watermark tests + // that check split assignment is ordered by the watermark column. + // The tests assumes default parallelism of 1 with single reader task + // in order to check the order of read records. 
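+ // With inference enabled, the source parallelism could scale up to the number of splits and
+ // break that single-reader ordering assumption.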
+ tableConf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); + + tableEnvironment.getConfig().set("table.exec.resource.default-parallelism", "1"); + SqlHelpers.sql( + tableEnvironment, + "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_EXTENSION.warehouse()); + SqlHelpers.sql(tableEnvironment, "use catalog iceberg_catalog"); + + tableConf.set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + } + + @AfterEach + public void after() throws IOException { + CATALOG_EXTENSION.catalog().dropTable(TestFixtures.TABLE_IDENTIFIER); + } + + private Record generateRecord(Instant t1, long t2) { + Record record = GenericRecord.create(SCHEMA_TS); + record.setField("t1", t1.atZone(ZoneId.systemDefault()).toLocalDateTime()); + record.setField("t2", t2); + return record; + } + + /** Generates the records in the expected order, with respect to their datafile */ + private List generateExpectedRecords(boolean ascending) throws Exception { + Table table = CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA_TS); + long baseTime = 1702382109000L; + + GenericAppenderHelper helper = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); + + Record file1Record1 = + generateRecord(Instant.ofEpochMilli(baseTime), baseTime + (1000 * 60 * 60 * 24 * 30L)); + Record file1Record2 = + generateRecord( + Instant.ofEpochMilli(baseTime - 10 * 1000L), baseTime + (1000 * 60 * 60 * 24 * 35L)); + + List recordsDataFile1 = Lists.newArrayList(); + recordsDataFile1.add(file1Record1); + recordsDataFile1.add(file1Record2); + DataFile dataFile1 = helper.writeFile(recordsDataFile1); + + Record file2Record1 = + generateRecord( + Instant.ofEpochMilli(baseTime + 14 * 1000L), baseTime - (1000 * 60 * 60 * 24 * 30L)); + Record file2Record2 = + generateRecord( + Instant.ofEpochMilli(baseTime + 12 * 1000L), baseTime - (1000 * 60 * 61 * 24 * 35L)); + + List recordsDataFile2 = Lists.newArrayList(); + recordsDataFile2.add(file2Record1); + recordsDataFile2.add(file2Record2); + + DataFile dataFile2 = helper.writeFile(recordsDataFile2); + helper.appendToTable(dataFile1, dataFile2); + + // Expected records if the splits are ordered + // - ascending (watermark from t1) - records from the split with early timestamps, then + // records from the split with late timestamps + // - descending (watermark from t2) - records from the split with old longs, then records + // from the split with new longs + List expected = Lists.newArrayList(); + if (ascending) { + expected.addAll(recordsDataFile1); + expected.addAll(recordsDataFile2); + } else { + expected.addAll(recordsDataFile2); + expected.addAll(recordsDataFile1); + } + return expected; + } + + /** Tests the order of splits returned when setting the watermark-column options */ + @Test + public void testWatermarkOptionsAscending() throws Exception { + List expected = generateExpectedRecords(true); + TestHelpers.assertRecordsWithOrder( + run( + ImmutableMap.of("watermark-column", "t1", "split-file-open-cost", "128000000"), + "", + "*"), + expected, + SCHEMA_TS); + } + + /** + * Tests the order of splits returned when setting the watermark-column and + * watermark-column-time-unit" options + */ + @Test + public void testWatermarkOptionsDescending() throws Exception { + List expected = generateExpectedRecords(false); + TestHelpers.assertRecordsWithOrder( + run( + ImmutableMap.of( + "watermark-column", + "t2", + "watermark-column-time-unit", + "MILLISECONDS", + 
"split-file-open-cost", + "128000000"), + "", + "*"), + expected, + SCHEMA_TS); + } + + @Test + public void testReadFlinkDynamicTable() throws Exception { + List expected = generateExpectedRecords(false); + SqlHelpers.sql( + getTableEnv(), + "create table `default_catalog`.`default_database`.flink_table LIKE iceberg_catalog.`default`.%s", + TestFixtures.TABLE); + + // Read from table in flink catalog + TestHelpers.assertRecords( + SqlHelpers.sql( + getTableEnv(), "select * from `default_catalog`.`default_database`.flink_table"), + expected, + SCHEMA_TS); + } + + @Test + public void testWatermarkInvalidConfig() { + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA_TS); + + String flinkTable = "`default_catalog`.`default_database`.flink_table"; + SqlHelpers.sql( + getStreamingTableEnv(), + "CREATE TABLE %s " + + "(eventTS AS CAST(t1 AS TIMESTAMP(3)), " + + "WATERMARK FOR eventTS AS SOURCE_WATERMARK()) LIKE iceberg_catalog.`default`.%s", + flinkTable, + TestFixtures.TABLE); + + assertThatThrownBy(() -> SqlHelpers.sql(getStreamingTableEnv(), "SELECT * FROM %s", flinkTable)) + .isInstanceOf(NullPointerException.class) + .hasMessage("watermark-column needs to be configured to use source watermark."); + } + + @Test + public void testWatermarkValidConfig() throws Exception { + List expected = generateExpectedRecords(true); + + String flinkTable = "`default_catalog`.`default_database`.flink_table"; + + SqlHelpers.sql( + getStreamingTableEnv(), + "CREATE TABLE %s " + + "(eventTS AS CAST(t1 AS TIMESTAMP(3)), " + + "WATERMARK FOR eventTS AS SOURCE_WATERMARK()) WITH ('watermark-column'='t1') LIKE iceberg_catalog.`default`.%s", + flinkTable, + TestFixtures.TABLE); + + TestHelpers.assertRecordsWithOrder( + SqlHelpers.sql( + getStreamingTableEnv(), + "SELECT t1, t2 FROM TABLE(TUMBLE(TABLE %s, DESCRIPTOR(eventTS), INTERVAL '1' SECOND))", + flinkTable), + expected, + SCHEMA_TS); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java new file mode 100644 index 000000000000..f84cf7fb1aae --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java @@ -0,0 +1,408 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; + +import java.io.Serializable; +import java.nio.file.Path; +import java.time.Duration; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.temporal.ChronoUnit; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.metrics.Gauge; +import org.apache.flink.runtime.metrics.MetricNames; +import org.apache.flink.runtime.minicluster.MiniCluster; +import org.apache.flink.runtime.minicluster.RpcServiceSharing; +import org.apache.flink.runtime.testutils.CommonTestUtils; +import org.apache.flink.runtime.testutils.InMemoryReporter; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction; +import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows; +import org.apache.flink.streaming.api.windowing.windows.TimeWindow; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.test.junit5.InjectMiniCluster; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.util.CloseableIterator; +import org.apache.flink.util.Collector; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopTableExtension; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +public class TestIcebergSourceWithWatermarkExtractor implements Serializable { + private static final int PARALLELISM = 4; + private static final String SOURCE_NAME = "IcebergSource"; + private static final int RECORD_NUM_FOR_2_SPLITS = 200; + private static final ConcurrentMap WINDOWS = Maps.newConcurrentMap(); + + @TempDir protected Path temporaryFolder; + + private static final InMemoryReporter REPORTER = InMemoryReporter.createWithRetainedMetrics(); + + @RegisterExtension + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(1) + .setNumberSlotsPerTaskManager(PARALLELISM) + .setRpcServiceSharing(RpcServiceSharing.DEDICATED) + .setConfiguration(REPORTER.addToConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG)) + .withHaLeadershipControl() + 
.build()); + + @RegisterExtension + private static final HadoopTableExtension TABLE_EXTENSION = + new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.TS_SCHEMA); + + /** + * This is an integration test for watermark handling and windowing. Integration testing the + * following features: + * + *
<ul> + *   <li>- Ordering of the splits + *   <li>- Emitting of watermarks + *   <li>- Firing windows based on watermarks + * </ul> + * + *
<p>The test generates 4 splits: + * + *
<ul> + *   <li>- Split 1 - Watermark 100 min + *   <li>- Split 2, 3 - Watermark 0 min + *   <li>- Split 4 - Watermark 6 min + * </ul> + * + *
<p>Creates a source with a 5-minute tumbling window and parallelism 1 (to prevent concurrency + * issues). + * + *
<p>Checks that windows are handled correctly based on the emitted watermarks, and that splits are + * read in the following order: + * + *
<ul> + *   <li>- Split 2, 3 + *   <li>- Split 4 + *   <li>- Split 1 + * </ul> + * + *
<p>As a result, the window aggregator emits the records based on the data in Split 2-3 and + * Split 4. + * + *
    Add 2 more splits, so the task manager close the windows for the original 4 splits and emit + * the appropriate aggregated records. + */ + @Test + public void testWindowing() throws Exception { + GenericAppenderHelper dataAppender = appender(); + List expectedRecords = Lists.newArrayList(); + + // Generate records with the following pattern: + // - File 1 - Later records (Watermark 6000000) + // - Split 1 - 2 records (100, "file_1-recordTs_100"), (103, "file_1-recordTs_103") + // - File 2 - First records (Watermark 0) + // - Split 1 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... + // - Split 2 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... + // - File 3 - Parallel write for the first records (Watermark 360000) + // - Split 1 - 2 records (6, "file_3-recordTs_6"), (7, "file_3-recordTs_7") + List batch = + ImmutableList.of( + generateRecord(100, "file_1-recordTs_100"), + generateRecord(101, "file_1-recordTs_101"), + generateRecord(103, "file_1-recordTs_103")); + expectedRecords.addAll(batch); + dataAppender.appendToTable(batch); + + batch = Lists.newArrayListWithCapacity(100); + for (int i = 0; i < RECORD_NUM_FOR_2_SPLITS; ++i) { + // Generate records where the timestamps are out of order, but still between 0-5 minutes + batch.add(generateRecord(4 - i % 5, "file_2-recordTs_" + i)); + } + expectedRecords.addAll(batch); + dataAppender.appendToTable(batch); + + batch = + ImmutableList.of( + generateRecord(6, "file_3-recordTs_6"), generateRecord(7, "file_3-recordTs_7")); + expectedRecords.addAll(batch); + dataAppender.appendToTable(batch); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + + DataStream stream = + env.fromSource( + source(), + WatermarkStrategy.noWatermarks() + .withTimestampAssigner(new RowDataTimestampAssigner()), + SOURCE_NAME, + TypeInformation.of(RowData.class)); + + stream + .windowAll(TumblingEventTimeWindows.of(Duration.ofMinutes(5))) + .apply( + new AllWindowFunction() { + @Override + public void apply( + TimeWindow window, Iterable values, Collector out) { + // Emit RowData which contains the window start time, and the record count in + // that window + AtomicInteger count = new AtomicInteger(0); + values.forEach(a -> count.incrementAndGet()); + out.collect(row(window.getStart(), count.get())); + WINDOWS.put(window.getStart(), count.get()); + } + }); + + // Use static variable to collect the windows, since other solutions were flaky + WINDOWS.clear(); + env.executeAsync("Iceberg Source Windowing Test"); + + // Wait for the 2 first windows from File 2 and File 3 + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until( + () -> + WINDOWS.equals( + ImmutableMap.of(0L, RECORD_NUM_FOR_2_SPLITS, TimeUnit.MINUTES.toMillis(5), 2))); + + // Write data so the windows containing test data are closed + dataAppender.appendToTable( + dataAppender.writeFile(ImmutableList.of(generateRecord(1500, "last-record")))); + + // Wait for last test record window from File 1 + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until( + () -> + WINDOWS.equals( + ImmutableMap.of( + 0L, + RECORD_NUM_FOR_2_SPLITS, + TimeUnit.MINUTES.toMillis(5), + 2, + TimeUnit.MINUTES.toMillis(100), + 3))); + } + + /** + * This is an integration test for watermark handling and throttling. Integration testing the + * following: + * + *

<ul> + *   <li>- Emitting of watermarks + *   <li>- Watermark alignment + * </ul> + * + *
<p>The test generates 3 splits: + * + *
<ul> + *   <li>- Split 1 - Watermark 100 min + *   <li>- Split 2, 3 - Watermark 0 min + * </ul> + * + *
<p>The splits are read in the following order: + * + *
<ul> + *   <li>- Split 2, 3 (Task Manager 1, Task Manager 2) + *   <li>- Split 1 (Task Manager 1 or Task Manager 2, depending on scheduling) + * </ul> + * + *
<p>Reading split 1 will cause the watermark alignment to pause reading for the given task manager. + * + *
<p>The status of the watermark alignment is checked by the alignment-related metrics. + * + *
<p>Adding new records with old timestamps to the table enables the running reader to continue + * reading the files, but the watermark alignment still prevents the paused reader from continuing. + * + *
    After adding some records with new timestamps the blocked reader is un-paused, and both ot + * the readers continue reading. + */ + @Test + public void testThrottling(@InjectMiniCluster MiniCluster miniCluster) throws Exception { + GenericAppenderHelper dataAppender = appender(); + + // Generate records in advance + + // File 1 - Later records (Watermark 6.000.000 - 100 min) + // - Split 1 - 2 records (100, "file_1-recordTs_100"), (103, "file_1-recordTs_103") + List batch1 = + ImmutableList.of( + generateRecord(100, "file_1-recordTs_100"), generateRecord(103, "file_1-recordTs_103")); + + // File 2 - First records (Watermark 0 - 0 min) + // - Split 1 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... + // - Split 2 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... + List batch2 = Lists.newArrayListWithCapacity(100); + for (int i = 0; i < RECORD_NUM_FOR_2_SPLITS; ++i) { + batch2.add(generateRecord(4 - i % 5, "file_2-recordTs_" + i)); + } + + // File 3 - Some records will be blocked (Watermark 900.000 - 15 min) + List batch3 = + ImmutableList.of( + generateRecord(15, "file_3-recordTs_15"), + generateRecord(16, "file_3-recordTs_16"), + generateRecord(17, "file_3-recordTs_17")); + + // File 4 - Some records will be blocked (Watermark 900.000 - 15 min) + List batch4 = + ImmutableList.of( + generateRecord(15, "file_4-recordTs_15"), + generateRecord(16, "file_4-recordTs_16"), + generateRecord(17, "file_4-recordTs_17")); + + // File 5 - Records which will remove the block (Watermark 5.400.000 - 90 min) + List batch5 = + ImmutableList.of( + generateRecord(90, "file_5-recordTs_90"), generateRecord(91, "file_5-recordTs_91")); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(2); + + DataStream stream = + env.fromSource( + source(), + WatermarkStrategy.noWatermarks() + .withWatermarkAlignment("iceberg", Duration.ofMinutes(20), Duration.ofMillis(10)), + SOURCE_NAME, + TypeInformation.of(RowData.class)); + + try (CloseableIterator resultIterator = stream.collectAsync()) { + JobClient jobClient = env.executeAsync("Iceberg Source Throttling Test"); + CommonTestUtils.waitForAllTaskRunning(miniCluster, jobClient.getJobID(), false); + + // Insert the first data into the table + dataAppender.appendToTable(dataAppender.writeFile(batch1), dataAppender.writeFile(batch2)); + + // Get the drift metric, wait for it to be created and reach the expected state + // (100 min - 20 min - 0 min) + // Also this validates that the WatermarkAlignment is working + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until( + () -> + findAlignmentDriftMetric(jobClient.getJobID(), TimeUnit.MINUTES.toMillis(80)) + .isPresent()); + Gauge drift = + findAlignmentDriftMetric(jobClient.getJobID(), TimeUnit.MINUTES.toMillis(80)).get(); + + // Add some old records with 2 splits, so even if the blocked gets one split, the other reader + // one gets one as well + dataAppender.appendToTable(dataAppender.writeFile(batch3), dataAppender.writeFile(batch4)); + + // Get the drift metric, wait for it to be created and reach the expected state (100 min - 20 + // min - 15 min) + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until(() -> drift.getValue() == TimeUnit.MINUTES.toMillis(65)); + + // Add some new records which should unblock the throttled reader + dataAppender.appendToTable(batch5); + + // Wait for the new drift to decrease below the allowed drift to 
signal the normal state + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until(() -> drift.getValue() < TimeUnit.MINUTES.toMillis(20)); + } + } + + protected IcebergSource source() { + return IcebergSource.builder() + .tableLoader(TABLE_EXTENSION.tableLoader()) + .watermarkColumn("ts") + .project(TestFixtures.TS_SCHEMA) + .splitSize(100L) + .streaming(true) + .monitorInterval(Duration.ofMillis(10)) + .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + } + + protected Record generateRecord(int minutes, String str) { + // Override the ts field to create a more realistic situation for event time alignment + Record record = GenericRecord.create(TestFixtures.TS_SCHEMA); + LocalDateTime ts = + LocalDateTime.ofInstant( + Instant.ofEpochMilli(Duration.of(minutes, ChronoUnit.MINUTES).toMillis()), + ZoneId.of("Z")); + record.setField("ts", ts); + record.setField("str", str); + return record; + } + + private Optional> findAlignmentDriftMetric(JobID jobID, long withValue) { + String metricsName = SOURCE_NAME + ".*" + MetricNames.WATERMARK_ALIGNMENT_DRIFT; + return REPORTER.findMetrics(jobID, metricsName).values().stream() + .map(m -> (Gauge) m) + .filter(m -> m.getValue() == withValue) + .findFirst(); + } + + private GenericAppenderHelper appender() { + // We need to create multiple splits, so we need to generate parquet files with multiple offsets + org.apache.hadoop.conf.Configuration hadoopConf = new org.apache.hadoop.conf.Configuration(); + hadoopConf.set("write.parquet.page-size-bytes", "64"); + hadoopConf.set("write.parquet.row-group-size-bytes", "64"); + return new GenericAppenderHelper( + TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder, hadoopConf); + } + + private static RowData row(long time, long count) { + GenericRowData result = new GenericRowData(2); + result.setField(0, time); + result.setField(1, String.valueOf(count)); + return result; + } + + private static class RowDataTimestampAssigner implements SerializableTimestampAssigner { + @Override + public long extractTimestamp(RowData element, long recordTimestamp) { + return element.getTimestamp(0, 0).getMillisecond(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java new file mode 100644 index 000000000000..61a587e7786a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Files; +import java.time.Duration; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.api.common.RuntimeExecutionMode; +import org.apache.flink.api.common.TaskInfo; +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.configuration.BatchExecutionOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.configuration.SlowTaskDetectorOptions; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.TestBase; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +/** + * There is a infinite sleep in the test. Add a timeout to the test to avoid stuck situation in case + * anything goes wrong unexpectedly. 
+ */ +@Timeout(value = 60) +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergSpeculativeExecutionSupport extends TestBase { + private static final int NUM_TASK_MANAGERS = 1; + private static final int NUM_TASK_SLOTS = 3; + + @RegisterExtension + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(NUM_TASK_MANAGERS) + .setNumberSlotsPerTaskManager(NUM_TASK_SLOTS) + .setConfiguration(configure()) + .build()); + + private StreamTableEnvironment tEnv; + private static final String CATALOG_NAME = "test_catalog"; + private static final String DATABASE_NAME = "test_db"; + private static final String INPUT_TABLE_NAME = "test_table"; + private static final String OUTPUT_TABLE_NAME = "sink_table"; + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment(configure()); + env.setRuntimeMode(RuntimeExecutionMode.BATCH); + tEnv = StreamTableEnvironment.create(env); + } + } + + return tEnv; + } + + @Parameter(index = 0) + private boolean useV2Sink; + + @Parameters(name = "useV2Sink = {0}") + public static Object[][] parameters() { + return new Object[][] {{true}, {false}}; + } + + @BeforeEach + public void before() throws IOException { + String warehouse = + String.format("file:%s", Files.createTempDirectory(temporaryDirectory, "junit").toString()); + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_NAME, warehouse); + sql("USE CATALOG %s", CATALOG_NAME); + sql("CREATE DATABASE %s", DATABASE_NAME); + sql("USE %s", DATABASE_NAME); + + sql("CREATE TABLE %s (i INT, j INT)", INPUT_TABLE_NAME); + sql("INSERT INTO %s VALUES (1, -1),(2, -1),(3, -1)", INPUT_TABLE_NAME); + sql("CREATE TABLE %s (i INT, j INT, subTask INT, attempt INT)", OUTPUT_TABLE_NAME); + } + + @AfterEach + public void after() { + sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, INPUT_TABLE_NAME); + sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME); + dropDatabase(DATABASE_NAME, true); + dropCatalog(CATALOG_NAME, true); + } + + @TestTemplate + public void testSpeculativeExecution() throws Exception { + tEnv.getConfig().set("table.exec.iceberg.use-v2-sink", String.valueOf(useV2Sink)); + Table table = + tEnv.sqlQuery(String.format("SELECT * FROM %s.%s", DATABASE_NAME, INPUT_TABLE_NAME)); + DataStream slowStream = + tEnv.toDataStream(table, Row.class) + .map(new TestingMap()) + .name("test_map") + .returns( + Types.ROW_NAMED( + new String[] {"i", "j", "subTask", "attempt"}, + Types.INT, + Types.INT, + Types.INT, + Types.INT)) + .setParallelism(NUM_TASK_SLOTS); + + tEnv.fromDataStream(slowStream) + .executeInsert(String.format("%s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)) + .await(); + + List output = sql(String.format("SELECT * FROM %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)); + + // Ensure that all subTasks has attemptNum > 0 + assertThat(output.stream().map(x -> x.getField(3)).collect(Collectors.toSet())).contains(1); + + // Ensure the test_table rows are returned exactly the same after the slow map task from the + // sink_table + assertSameElements( + output.stream().map(x -> Row.of(x.getField(0), x.getField(1))).collect(Collectors.toList()), + Arrays.asList(Row.of(1, -1), Row.of(2, -1), Row.of(3, -1))); + } + + /** A testing map function that simulates the slow task. 
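Subtask 0 sleeps indefinitely on its first attempt, which triggers the slow task detector and a speculative attempt.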
+   */
+  private static class TestingMap extends RichMapFunction<Row, Row> {
+    @Override
+    public Row map(Row row) throws Exception {
+      // Simulate slow subtask 0 with attempt 0
+      TaskInfo taskInfo = getRuntimeContext().getTaskInfo();
+      if (taskInfo.getIndexOfThisSubtask() == 0 && taskInfo.getAttemptNumber() <= 0) {
+        Thread.sleep(Integer.MAX_VALUE);
+      }
+
+      Row output =
+          Row.of(
+              row.getField(0),
+              row.getField(1),
+              getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(),
+              getRuntimeContext().getTaskInfo().getAttemptNumber());
+
+      return output;
+    }
+  }
+
+  private static Configuration configure() {
+    Configuration configuration = new Configuration();
+    configuration.set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false);
+    configuration.set(RestOptions.BIND_PORT, "0");
+    configuration.set(JobManagerOptions.SLOT_REQUEST_TIMEOUT, Duration.ofSeconds(5));
+
+    // Use FLIP-27 source
+    configuration.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true);
+    configuration.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false);
+
+    // for speculative execution
+    configuration.set(BatchExecutionOptions.SPECULATIVE_ENABLED, true);
+
+    configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_MULTIPLIER, 1.0);
+    configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_RATIO, 0.2);
+    configuration.set(
+        SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_LOWER_BOUND, Duration.ofMillis(0));
+    configuration.set(BatchExecutionOptions.BLOCK_SLOW_NODE_DURATION, Duration.ofMillis(0));
+
+    return configuration;
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java
new file mode 100644
index 000000000000..488969bab045
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java
@@ -0,0 +1,364 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + +import java.io.File; +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.nio.file.Path; +import java.util.Base64; +import java.util.List; +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Files; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.FileHelpers; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.CatalogTestBase; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.io.TempDir; + +public class TestMetadataTableReadableMetrics extends CatalogTestBase { + private static final String TABLE_NAME = "test_table"; + + @Parameters(name = "catalogName={0}, baseNamespace={1}") + protected static List parameters() { + List parameters = Lists.newArrayList(); + String catalogName = "testhive"; + Namespace baseNamespace = Namespace.empty(); + parameters.add(new Object[] {catalogName, baseNamespace}); + return parameters; + } + + @Override + protected TableEnvironment getTableEnv() { + Configuration configuration = super.getTableEnv().getConfig().getConfiguration(); + configuration.set(CoreOptions.DEFAULT_PARALLELISM, 1); + return super.getTableEnv(); + } + + private @TempDir Path temp; + + private static final Types.StructType LEAF_STRUCT_TYPE = + Types.StructType.of( + optional(1, "leafLongCol", Types.LongType.get()), + optional(2, "leafDoubleCol", Types.DoubleType.get())); + + private static final Types.StructType NESTED_STRUCT_TYPE = + Types.StructType.of(required(3, "leafStructCol", LEAF_STRUCT_TYPE)); + + private static final Schema NESTED_SCHEMA = + new Schema(required(4, "nestedStructCol", NESTED_STRUCT_TYPE)); + + private static final Schema PRIMITIVE_SCHEMA = + new Schema( + required(1, "booleanCol", Types.BooleanType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "longCol", Types.LongType.get()), + required(4, "floatCol", Types.FloatType.get()), + required(5, "doubleCol", Types.DoubleType.get()), + optional(6, "decimalCol", Types.DecimalType.of(10, 2)), + optional(7, "stringCol", Types.StringType.get()), + optional(8, "fixedCol", Types.FixedType.ofLength(3)), + optional(9, "binaryCol", Types.BinaryType.get())); + + private Table createPrimitiveTable() throws IOException { + Table table = + catalog.createTable( + TableIdentifier.of(DATABASE, TABLE_NAME), + PRIMITIVE_SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of()); + List records = + Lists.newArrayList( + createPrimitiveRecord( + false, 
+ 1, + 1L, + 0, + 1.0D, + new BigDecimal("1.00"), + "1", + Base64.getDecoder().decode("1111"), + ByteBuffer.wrap(Base64.getDecoder().decode("1111"))), + createPrimitiveRecord( + true, + 2, + 2L, + 0, + 2.0D, + new BigDecimal("2.00"), + "2", + Base64.getDecoder().decode("2222"), + ByteBuffer.wrap(Base64.getDecoder().decode("2222"))), + createPrimitiveRecord(false, 1, 1, Float.NaN, Double.NaN, null, "1", null, null), + createPrimitiveRecord( + false, 2, 2L, Float.NaN, 2.0D, new BigDecimal("2.00"), "2", null, null)); + + File testFile = File.createTempFile("junit", null, temp.toFile()); + DataFile dataFile = FileHelpers.writeDataFile(table, Files.localOutput(testFile), records); + table.newAppend().appendFile(dataFile).commit(); + return table; + } + + private Table createNestedTable() throws IOException { + Table table = + validationCatalog.createTable( + TableIdentifier.of(DATABASE, TABLE_NAME), + NESTED_SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of()); + + List records = + Lists.newArrayList( + createNestedRecord(0L, 0.0), + createNestedRecord(1L, Double.NaN), + createNestedRecord(null, null)); + + File testFile = File.createTempFile("junit", null, temp.toFile()); + DataFile dataFile = FileHelpers.writeDataFile(table, Files.localOutput(testFile), records); + table.newAppend().appendFile(dataFile).commit(); + + return table; + } + + @BeforeEach + public void before() { + super.before(); + sql("USE CATALOG %s", catalogName); + sql("CREATE DATABASE %s", DATABASE); + sql("USE %s", DATABASE); + } + + @Override + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + protected GenericRecord createPrimitiveRecord( + boolean booleanCol, + int intCol, + long longCol, + float floatCol, + double doubleCol, + BigDecimal decimalCol, + String stringCol, + byte[] fixedCol, + ByteBuffer binaryCol) { + GenericRecord record = GenericRecord.create(PRIMITIVE_SCHEMA); + record.set(0, booleanCol); + record.set(1, intCol); + record.set(2, longCol); + record.set(3, floatCol); + record.set(4, doubleCol); + record.set(5, decimalCol); + record.set(6, stringCol); + record.set(7, fixedCol); + record.set(8, binaryCol); + return record; + } + + private GenericRecord createNestedRecord(Long longCol, Double doubleCol) { + GenericRecord record = GenericRecord.create(NESTED_SCHEMA); + GenericRecord nested = GenericRecord.create(NESTED_STRUCT_TYPE); + GenericRecord leaf = GenericRecord.create(LEAF_STRUCT_TYPE); + leaf.set(0, longCol); + leaf.set(1, doubleCol); + nested.set(0, leaf); + record.set(0, nested); + return record; + } + + protected Object[] row(Object... values) { + return values; + } + + @TestTemplate + public void testPrimitiveColumns() throws Exception { + Table table = createPrimitiveTable(); + List result = sql("SELECT readable_metrics FROM %s$files", TABLE_NAME); + + // With new releases of Parquet, new features might be added which cause the + // size of the column to increase. For example, with Parquet 1.14.x the + // uncompressed size has been added to allow for better allocation of memory upfront. 
+ // Therefore, we look the sizes up, rather than hardcoding them + DataFile dataFile = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); + Map columnSizeStats = dataFile.columnSizes(); + + Row binaryCol = + Row.of( + columnSizeStats.get(PRIMITIVE_SCHEMA.findField("binaryCol").fieldId()), + 4L, + 2L, + null, + Base64.getDecoder().decode("1111"), + Base64.getDecoder().decode("2222")); + Row booleanCol = + Row.of( + columnSizeStats.get(PRIMITIVE_SCHEMA.findField("booleanCol").fieldId()), + 4L, + 0L, + null, + false, + true); + Row decimalCol = + Row.of( + columnSizeStats.get(PRIMITIVE_SCHEMA.findField("decimalCol").fieldId()), + 4L, + 1L, + null, + new BigDecimal("1.00"), + new BigDecimal("2.00")); + Row doubleCol = + Row.of( + columnSizeStats.get(PRIMITIVE_SCHEMA.findField("doubleCol").fieldId()), + 4L, + 0L, + 1L, + 1.0D, + 2.0D); + Row fixedCol = + Row.of( + columnSizeStats.get(PRIMITIVE_SCHEMA.findField("fixedCol").fieldId()), + 4L, + 2L, + null, + Base64.getDecoder().decode("1111"), + Base64.getDecoder().decode("2222")); + Row floatCol = + Row.of( + columnSizeStats.get(PRIMITIVE_SCHEMA.findField("floatCol").fieldId()), + 4L, + 0L, + 2L, + 0f, + 0f); + Row intCol = + Row.of( + columnSizeStats.get(PRIMITIVE_SCHEMA.findField("intCol").fieldId()), + 4L, + 0L, + null, + 1, + 2); + Row longCol = + Row.of( + columnSizeStats.get(PRIMITIVE_SCHEMA.findField("longCol").fieldId()), + 4L, + 0L, + null, + 1L, + 2L); + Row stringCol = + Row.of( + columnSizeStats.get(PRIMITIVE_SCHEMA.findField("stringCol").fieldId()), + 4L, + 0L, + null, + "1", + "2"); + + List expected = + Lists.newArrayList( + Row.of( + Row.of( + binaryCol, + booleanCol, + decimalCol, + doubleCol, + fixedCol, + floatCol, + intCol, + longCol, + stringCol))); + TestHelpers.assertRows(result, expected); + } + + @TestTemplate + public void testSelectPrimitiveValues() throws Exception { + createPrimitiveTable(); + + TestHelpers.assertRows( + sql( + "SELECT readable_metrics.intCol.lower_bound, readable_metrics.booleanCol.upper_bound FROM %s$files", + TABLE_NAME), + ImmutableList.of(Row.of(1, true))); + + TestHelpers.assertRows( + sql("SELECT content, readable_metrics.longCol.value_count FROM %s$files", TABLE_NAME), + ImmutableList.of(Row.of(0, 4L))); + + TestHelpers.assertRows( + sql("SELECT readable_metrics.longCol.value_count, content FROM %s$files", TABLE_NAME), + ImmutableList.of(Row.of(4L, 0))); + } + + @TestTemplate + public void testSelectNestedValues() throws Exception { + createNestedTable(); + TestHelpers.assertRows( + sql( + "SELECT readable_metrics.`nestedStructCol.leafStructCol.leafLongCol`.lower_bound, " + + "readable_metrics.`nestedStructCol.leafStructCol.leafDoubleCol`.value_count FROM %s$files", + TABLE_NAME), + ImmutableList.of(Row.of(0L, 3L))); + } + + @TestTemplate + public void testNestedValues() throws Exception { + createNestedTable(); + List result = sql("SELECT readable_metrics FROM %s$files", TABLE_NAME); + + // We have to take a slightly different approach, since we don't store + // the column sizes for nested fields. 
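+    // Instead, the column sizes are read back from the readable_metrics rows in the query result.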
+ long leafDoubleColSize = + (long) ((Row) ((Row) result.get(0).getField(0)).getField(0)).getField(0); + long leafLongColSize = (long) ((Row) ((Row) result.get(0).getField(0)).getField(1)).getField(0); + + Row leafDoubleCol = Row.of(leafDoubleColSize, 3L, 1L, 1L, 0.0D, 0.0D); + Row leafLongCol = Row.of(leafLongColSize, 3L, 1L, null, 0L, 1L); + Row metrics = Row.of(Row.of(leafDoubleCol, leafLongCol)); + + TestHelpers.assertRows(result, ImmutableList.of(metrics)); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java new file mode 100644 index 000000000000..ef8380c21613 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.Set; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.RowDelta; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.data.RowDataProjection; +import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; +import org.apache.iceberg.flink.sink.TaskWriterFactory; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestProjectMetaColumn { + + @TempDir protected Path temporaryFolder; + + @Parameter(index = 0) + private FileFormat format; + + @Parameters(name = "fileFormat={0}") + public static Iterable parameters() { + return Lists.newArrayList( 
+ new Object[] {FileFormat.PARQUET}, + new Object[] {FileFormat.ORC}, + new Object[] {FileFormat.AVRO}); + } + + private void testSkipToRemoveMetaColumn(int formatVersion) throws IOException { + // Create the table with given format version. + String location = Files.createTempDirectory(temporaryFolder, "junit").toFile().toString(); + Table table = + SimpleDataUtil.createTable( + location, + ImmutableMap.of(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)), + false); + + List rows = + Lists.newArrayList( + SimpleDataUtil.createInsert(1, "AAA"), + SimpleDataUtil.createInsert(2, "BBB"), + SimpleDataUtil.createInsert(3, "CCC")); + writeAndCommit(table, ImmutableSet.of(), false, rows); + + FlinkInputFormat input = + FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); + + List results = Lists.newArrayList(); + TestHelpers.readRowData( + input, + rowData -> { + // If project to remove the meta columns, it will get a RowDataProjection. + assertThat(rowData).isInstanceOf(GenericRowData.class); + results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); + }); + + // Assert the results. + TestHelpers.assertRows(rows, results, SimpleDataUtil.ROW_TYPE); + } + + @TestTemplate + public void testV1SkipToRemoveMetaColumn() throws IOException { + testSkipToRemoveMetaColumn(1); + } + + @TestTemplate + public void testV2SkipToRemoveMetaColumn() throws IOException { + testSkipToRemoveMetaColumn(2); + } + + @TestTemplate + public void testV2RemoveMetaColumn() throws Exception { + // Create the v2 table. + String location = Files.createTempDirectory(temporaryFolder, "junit").toFile().toString(); + Table table = + SimpleDataUtil.createTable( + location, ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), false); + + List rows = + Lists.newArrayList( + SimpleDataUtil.createInsert(1, "AAA"), + SimpleDataUtil.createDelete(1, "AAA"), + SimpleDataUtil.createInsert(2, "AAA"), + SimpleDataUtil.createInsert(2, "BBB")); + int eqFieldId = table.schema().findField("data").fieldId(); + writeAndCommit(table, ImmutableSet.of(eqFieldId), true, rows); + + FlinkInputFormat input = + FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); + + List results = Lists.newArrayList(); + TestHelpers.readRowData( + input, + rowData -> { + // If project to remove the meta columns, it will get a RowDataProjection. + assertThat(rowData).isInstanceOf(RowDataProjection.class); + results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); + }); + + // Assert the results. 
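+    // With upsert on the 'data' column, only the latest row per value is expected:
+    // (2, AAA) and (2, BBB).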
+ TestHelpers.assertRows( + ImmutableList.of( + SimpleDataUtil.createInsert(2, "AAA"), SimpleDataUtil.createInsert(2, "BBB")), + results, + SimpleDataUtil.ROW_TYPE); + } + + private void writeAndCommit( + Table table, Set eqFieldIds, boolean upsert, List rows) throws IOException { + TaskWriter writer = createTaskWriter(table, eqFieldIds, upsert); + try (TaskWriter io = writer) { + for (RowData row : rows) { + io.write(row); + } + } + + RowDelta delta = table.newRowDelta(); + WriteResult result = writer.complete(); + + for (DataFile dataFile : result.dataFiles()) { + delta.addRows(dataFile); + } + + for (DeleteFile deleteFile : result.deleteFiles()) { + delta.addDeletes(deleteFile); + } + + delta.commit(); + } + + private TaskWriter createTaskWriter( + Table table, Set equalityFieldIds, boolean upsert) { + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + SimpleDataUtil.ROW_TYPE, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, + format, + table.properties(), + equalityFieldIds, + upsert); + + taskWriterFactory.initialize(1, 1); + return taskWriterFactory.create(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java new file mode 100644 index 000000000000..6ef40693827e --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.avro.generic.GenericRecord; +import org.apache.iceberg.flink.AvroGenericRecordConverterBase; +import org.apache.iceberg.flink.DataGenerator; + +public class TestRowDataToAvroGenericRecordConverter extends AvroGenericRecordConverterBase { + @Override + protected void testConverter(DataGenerator dataGenerator) { + RowDataToAvroGenericRecordConverter converter = + RowDataToAvroGenericRecordConverter.fromAvroSchema(dataGenerator.avroSchema()); + GenericRecord expected = dataGenerator.generateAvroGenericRecord(); + GenericRecord actual = converter.apply(dataGenerator.generateFlinkRowData()); + assertThat(actual).isEqualTo(expected); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java new file mode 100644 index 000000000000..5dd7de545e11 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.junit.jupiter.api.Test; + +class TestScanContext { + @Test + void testIncrementalFromSnapshotId() { + ScanContext context = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .build(); + assertException( + context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + + context = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(1L) + .startSnapshotTimestamp(1L) + .build(); + assertException( + context, + "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + } + + @Test + void testIncrementalFromSnapshotTimestamp() { + ScanContext context = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .build(); + assertException( + context, + "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + + context = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotId(1L) + .startSnapshotTimestamp(1L) + .build(); + assertException( + context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + } + + @Test + void testStreaming() { + ScanContext context = ScanContext.builder().streaming(true).useTag("tag").build(); + assertException(context, "Cannot scan table using ref tag configured for streaming reader"); + + context = ScanContext.builder().streaming(true).useSnapshotId(1L).build(); + assertException(context, "Cannot set snapshot-id option for streaming reader"); + + context = ScanContext.builder().streaming(true).asOfTimestamp(1L).build(); + assertException(context, "Cannot set as-of-timestamp option for streaming reader"); + + context = ScanContext.builder().streaming(true).endSnapshotId(1L).build(); + assertException(context, "Cannot set end-snapshot-id option for streaming reader"); + + context = ScanContext.builder().streaming(true).endTag("tag").build(); + assertException(context, "Cannot set end-tag option for streaming reader"); + } + + @Test + void testStartConflict() { + ScanContext context = ScanContext.builder().startTag("tag").startSnapshotId(1L).build(); + assertException(context, "START_SNAPSHOT_ID and START_TAG cannot both be set."); + } + + @Test + void testEndConflict() { + ScanContext context = ScanContext.builder().endTag("tag").endSnapshotId(1L).build(); + assertException(context, "END_SNAPSHOT_ID and END_TAG cannot both be set."); + } + + @Test + void testMaxAllowedPlanningFailures() { + ScanContext context = ScanContext.builder().maxAllowedPlanningFailures(-2).build(); + assertException( + context, "Cannot set maxAllowedPlanningFailures to a negative number other than -1."); + } + + private void assertException(ScanContext context, String message) { + assertThatThrownBy(() -> context.validate()) + .hasMessage(message) + .isInstanceOf(IllegalArgumentException.class); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java new file mode 100644 index 000000000000..db85f108ab1b --- /dev/null +++ 
b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import org.apache.flink.configuration.Configuration; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.junit.jupiter.api.Test; + +public class TestSourceUtil { + @Test + public void testInferedParallelism() throws IOException { + Configuration configuration = new Configuration(); + // Empty table, infer parallelism should be at least 1 + int parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 0); + assertThat(parallelism).isEqualTo(1); + + // 2 splits (max infer is the default value 100 , max > splits num), the parallelism is splits + // num : 2 + parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 2); + assertThat(parallelism).isEqualTo(2); + + // 2 splits and limit is 1 , max infer parallelism is default 100, + // which is greater than splits num and limit, the parallelism is the limit value : 1 + parallelism = SourceUtil.inferParallelism(configuration, 1, () -> 2); + assertThat(parallelism).isEqualTo(1); + + // 2 splits and max infer parallelism is 1 (max < splits num), the parallelism is 1 + configuration.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 1); + parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 2); + assertThat(parallelism).isEqualTo(1); + + // 2 splits, max infer parallelism is 1, limit is 3, the parallelism is max infer parallelism : + // 1 + parallelism = SourceUtil.inferParallelism(configuration, 3, () -> 2); + assertThat(parallelism).isEqualTo(1); + + // 2 splits, infer parallelism is disabled, the parallelism is flink default parallelism 1 + configuration.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); + parallelism = SourceUtil.inferParallelism(configuration, 3, () -> 2); + assertThat(parallelism).isEqualTo(1); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java new file mode 100644 index 000000000000..fe4c32731055 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +/** Test other more advanced usage of SQL. They don't need to run for every file format. 
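+ * Subclasses implement {@link #before()} to perform the per-test setup.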
*/ +public abstract class TestSqlBase { + @RegisterExtension + public static MiniClusterExtension miniClusterExtension = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + public static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + @TempDir protected Path temporaryFolder; + + private volatile TableEnvironment tEnv; + + private volatile TableEnvironment streamingTEnv; + + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + this.tEnv = + TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); + } + } + } + return tEnv; + } + + protected TableEnvironment getStreamingTableEnv() { + if (streamingTEnv == null) { + synchronized (this) { + if (streamingTEnv == null) { + this.streamingTEnv = + TableEnvironment.create(EnvironmentSettings.newInstance().inStreamingMode().build()); + } + } + } + + return streamingTEnv; + } + + @BeforeEach + public abstract void before() throws IOException; + + @Test + public void testResiduals() throws Exception { + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + + List writeRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + writeRecords.get(0).set(1, 123L); + writeRecords.get(0).set(2, "2020-03-20"); + writeRecords.get(1).set(1, 456L); + writeRecords.get(1).set(2, "2020-03-20"); + + GenericAppenderHelper helper = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); + + List expectedRecords = Lists.newArrayList(); + expectedRecords.add(writeRecords.get(0)); + + DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), writeRecords); + DataFile dataFile2 = + helper.writeFile( + TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + helper.appendToTable(dataFile1, dataFile2); + + org.apache.iceberg.flink.TestHelpers.assertRecords( + run(Maps.newHashMap(), "where dt='2020-03-20' and id=123", "*"), + expectedRecords, + TestFixtures.SCHEMA); + } + + @Test + public void testExposeLocality() throws Exception { + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + + TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 10, 0L); + expectedRecords.forEach(expectedRecord -> expectedRecord.set(2, "2020-03-20")); + + GenericAppenderHelper helper = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); + DataFile dataFile = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + helper.appendToTable(dataFile); + + // test sql api + Configuration tableConf = getTableEnv().getConfig().getConfiguration(); + tableConf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO, false); + + List results = SqlHelpers.sql(getTableEnv(), "select * from t"); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); + + // test table api + tableConf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO, true); + FlinkSource.Builder builder = FlinkSource.forRowData().tableLoader(tableLoader).table(table); + + // When running with CI or local, `localityEnabled` will be false even if this configuration is + // enabled + 
assertThat(SourceUtil.isLocalityEnabled(table, tableConf, true)) + .as("Expose split locality info should be false.") + .isFalse(); + + results = run(Maps.newHashMap(), "where dt='2020-03-20'", "*"); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); + } + + protected List run( + Map options, String sqlFilter, String... sqlSelectedFields) { + String select = String.join(",", sqlSelectedFields); + String optionStr = SqlHelpers.sqlOptionsToString(options); + return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java new file mode 100644 index 000000000000..2f3e0f78ba10 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java @@ -0,0 +1,490 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.ExplainDetail; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.api.config.TableConfigOptions; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.Table; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.CatalogTestBase; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.Timeout; + +@Timeout(60) +public class TestStreamScanSql extends CatalogTestBase { + private static final String TABLE = "test_table"; + private static final FileFormat FORMAT = FileFormat.PARQUET; + + private volatile int defaultJobParallelism; + + private volatile TableEnvironment tEnv; + + @Override + protected TableEnvironment getTableEnv() { + TableEnvironment tableEnv = tEnv; + if (tableEnv != null) { + return tableEnv; + } + synchronized (this) { + if (tEnv == null) { + EnvironmentSettings.Builder settingsBuilder = + EnvironmentSettings.newInstance().inStreamingMode(); + + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(400); + + StreamTableEnvironment streamTableEnv = + StreamTableEnvironment.create(env, settingsBuilder.build()); + defaultJobParallelism = env.getParallelism(); + streamTableEnv + .getConfig() + .getConfiguration() + .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + tEnv = streamTableEnv; + } + } + return tEnv; + } + + @Override + @BeforeEach + public void before() { + super.before(); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + } + + @Override + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + private void insertRows(String partition, Table table, Row... rows) throws IOException { + insertRows(partition, SnapshotRef.MAIN_BRANCH, table, rows); + } + + private void insertRows(String partition, String branch, Table table, Row... 
rows) + throws IOException { + GenericAppenderHelper appender = new GenericAppenderHelper(table, FORMAT, temporaryDirectory); + + GenericRecord gRecord = GenericRecord.create(table.schema()); + List records = Lists.newArrayList(); + for (Row row : rows) { + records.add( + gRecord.copy( + "id", row.getField(0), + "data", row.getField(1), + "dt", row.getField(2))); + } + + if (partition != null) { + appender.appendToTable(TestHelpers.Row.of(partition, 0), branch, records); + } else { + appender.appendToTable(branch, records); + } + } + + private void insertRowsInBranch(String branch, Table table, Row... rows) throws IOException { + insertRows(null, branch, table, rows); + } + + private void insertRows(Table table, Row... rows) throws IOException { + insertRows(null, table, rows); + } + + private void assertRows(List expectedRows, Iterator iterator) { + for (Row expectedRow : expectedRows) { + assertThat(iterator).hasNext(); + Row actualRow = iterator.next(); + assertThat(actualRow.getArity()).isEqualTo(3); + assertThat(actualRow.getField(0)).isEqualTo(expectedRow.getField(0)); + assertThat(actualRow.getField(1)).isEqualTo(expectedRow.getField(1)); + assertThat(actualRow.getField(2)).isEqualTo(expectedRow.getField(2)); + } + } + + @TestTemplate + public void testUnPartitionedTable() throws Exception { + sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); + + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + try (CloseableIterator iterator = result.collect()) { + + Row row1 = Row.of(1, "aaa", "2021-01-01"); + insertRows(table, row1); + assertRows(ImmutableList.of(row1), iterator); + + Row row2 = Row.of(2, "bbb", "2021-01-01"); + insertRows(table, row2); + assertRows(ImmutableList.of(row2), iterator); + } + result.getJobClient().ifPresent(JobClient::cancel); + } + + @TestTemplate + public void testPartitionedTable() throws Exception { + sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) PARTITIONED BY (dt)", TABLE); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); + + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + try (CloseableIterator iterator = result.collect()) { + Row row1 = Row.of(1, "aaa", "2021-01-01"); + insertRows("2021-01-01", table, row1); + assertRows(ImmutableList.of(row1), iterator); + + Row row2 = Row.of(2, "bbb", "2021-01-02"); + insertRows("2021-01-02", table, row2); + assertRows(ImmutableList.of(row2), iterator); + + Row row3 = Row.of(1, "aaa", "2021-01-02"); + insertRows("2021-01-02", table, row3); + assertRows(ImmutableList.of(row3), iterator); + + Row row4 = Row.of(2, "bbb", "2021-01-01"); + insertRows("2021-01-01", table, row4); + assertRows(ImmutableList.of(row4), iterator); + } + result.getJobClient().ifPresent(JobClient::cancel); + } + + @TestTemplate + public void testConsumeFromBeginning() throws Exception { + sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); + + Row row1 = Row.of(1, "aaa", "2021-01-01"); + Row row2 = Row.of(2, "bbb", "2021-01-01"); + insertRows(table, row1, row2); + + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + try (CloseableIterator iterator = result.collect()) { + 
assertRows(ImmutableList.of(row1, row2), iterator);
+
+      Row row3 = Row.of(3, "ccc", "2021-01-01");
+      insertRows(table, row3);
+      assertRows(ImmutableList.of(row3), iterator);
+
+      Row row4 = Row.of(4, "ddd", "2021-01-01");
+      insertRows(table, row4);
+      assertRows(ImmutableList.of(row4), iterator);
+    }
+    result.getJobClient().ifPresent(JobClient::cancel);
+  }
+
+  @TestTemplate
+  /**
+   * Insert records on the main branch. Then insert records in a named branch. Read from the main
+   * branch and assert that only the records from main are returned.
+   */
+  public void testConsumeFilesFromMainBranch() throws Exception {
+    sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE);
+    Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE));
+
+    // Produce two snapshots on main branch
+    Row row1 = Row.of(1, "aaa", "2021-01-01");
+    Row row2 = Row.of(2, "bbb", "2021-01-01");
+
+    insertRows(table, row1, row2);
+    String branchName = "b1";
+    table.manageSnapshots().createBranch(branchName).commit();
+
+    // insert on the 'b1' branch
+    Row row3 = Row.of(3, "ccc", "2021-01-01");
+    Row row4 = Row.of(4, "ddd", "2021-01-01");
+
+    insertRowsInBranch(branchName, table, row3, row4);
+
+    // read from main
+    TableResult result =
+        exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE);
+
+    try (CloseableIterator<Row> iterator = result.collect()) {
+      // only the rows committed to the main branch are expected
+      assertRows(ImmutableList.of(row1, row2), iterator);
+
+      Row row5 = Row.of(5, "eee", "2021-01-01");
+      Row row6 = Row.of(6, "fff", "2021-01-01");
+      insertRows(table, row5, row6);
+      assertRows(ImmutableList.of(row5, row6), iterator);
+
+      Row row7 = Row.of(7, "ggg", "2021-01-01");
+      insertRows(table, row7);
+      assertRows(ImmutableList.of(row7), iterator);
+    }
+    result.getJobClient().ifPresent(JobClient::cancel);
+  }
+
+  @TestTemplate
+  /**
+   * Insert records on the main branch. Create a named branch. Insert records on the named branch.
+   * Then select from the named branch and assert that all the records are returned.
+   */
+  public void testConsumeFilesFromBranch() throws Exception {
+    sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE);
+    Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE));
+
+    // Produce two snapshots on main branch
+    Row row1 = Row.of(1, "aaa", "2021-01-01");
+    Row row2 = Row.of(2, "bbb", "2021-01-01");
+
+    insertRows(table, row1, row2);
+    String branchName = "b1";
+    table.manageSnapshots().createBranch(branchName).commit();
+
+    TableResult result =
+        exec(
+            "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ",
+            TABLE, branchName);
+
+    try (CloseableIterator<Row> iterator = result.collect()) {
+      assertRows(ImmutableList.of(row1, row2), iterator);
+      // insert on the 'b1' branch
+      Row row3 = Row.of(3, "ccc", "2021-01-01");
+      Row row4 = Row.of(4, "ddd", "2021-01-01");
+      insertRowsInBranch(branchName, table, row3, row4);
+      assertRows(ImmutableList.of(row3, row4), iterator);
+    }
+    result.getJobClient().ifPresent(JobClient::cancel);
+  }
+
+  @TestTemplate
+  /**
+   * Insert records on branch b1. Then insert records on branch b2.
Then select from each branch and assert + * the correct records are returned + */ + public void testConsumeFilesFromTwoBranches() throws Exception { + sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); + + String branch1 = "b1"; + String branch2 = "b2"; + table.manageSnapshots().createBranch(branch1).commit(); + table.manageSnapshots().createBranch(branch2).commit(); + + // Produce two snapshots on main branch + Row row1Branch1 = Row.of(1, "b1", "2021-01-01"); + Row row2Branch1 = Row.of(2, "b1", "2021-01-01"); + + Row row1Branch2 = Row.of(2, "b2", "2021-01-01"); + Row row2Branch2 = Row.of(3, "b3", "2021-01-01"); + + insertRowsInBranch(branch1, table, row1Branch1, row2Branch1); + insertRowsInBranch(branch2, table, row1Branch2, row2Branch2); + + TableResult resultBranch1 = + exec( + "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ", + TABLE, branch1); + + try (CloseableIterator iterator = resultBranch1.collect()) { + assertRows(ImmutableList.of(row1Branch1, row2Branch1), iterator); + Row another = Row.of(4, "ccc", "2021-01-01"); + insertRowsInBranch(branch1, table, another); + assertRows(ImmutableList.of(another), iterator); + } + + TableResult resultBranch2 = + exec( + "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ", + TABLE, branch2); + try (CloseableIterator iterator = resultBranch2.collect()) { + assertRows(ImmutableList.of(row1Branch2, row2Branch2), iterator); + Row another = Row.of(4, "ccc", "2021-01-01"); + insertRowsInBranch(branch2, table, another); + assertRows(ImmutableList.of(another), iterator); + } + + resultBranch1.getJobClient().ifPresent(JobClient::cancel); + resultBranch2.getJobClient().ifPresent(JobClient::cancel); + } + + @TestTemplate + public void testConsumeFromStartSnapshotId() throws Exception { + sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); + + // Produce two snapshots. + Row row1 = Row.of(1, "aaa", "2021-01-01"); + Row row2 = Row.of(2, "bbb", "2021-01-01"); + insertRows(table, row1); + insertRows(table, row2); + + long startSnapshotId = table.currentSnapshot().snapshotId(); + + Row row3 = Row.of(3, "ccc", "2021-01-01"); + Row row4 = Row.of(4, "ddd", "2021-01-01"); + insertRows(table, row3, row4); + + TableResult result = + exec( + "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " + + "'start-snapshot-id'='%d')*/", + TABLE, startSnapshotId); + try (CloseableIterator iterator = result.collect()) { + // the start snapshot(row2) is exclusive. + assertRows(ImmutableList.of(row3, row4), iterator); + + Row row5 = Row.of(5, "eee", "2021-01-01"); + Row row6 = Row.of(6, "fff", "2021-01-01"); + insertRows(table, row5, row6); + assertRows(ImmutableList.of(row5, row6), iterator); + + Row row7 = Row.of(7, "ggg", "2021-01-01"); + insertRows(table, row7); + assertRows(ImmutableList.of(row7), iterator); + } + result.getJobClient().ifPresent(JobClient::cancel); + } + + @TestTemplate + public void testConsumeFromStartTag() throws Exception { + sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); + + // Produce two snapshots. 
+ Row row1 = Row.of(1, "aaa", "2021-01-01"); + Row row2 = Row.of(2, "bbb", "2021-01-01"); + insertRows(table, row1); + insertRows(table, row2); + + String tagName = "t1"; + long startSnapshotId = table.currentSnapshot().snapshotId(); + table.manageSnapshots().createTag(tagName, startSnapshotId).commit(); + + Row row3 = Row.of(3, "ccc", "2021-01-01"); + Row row4 = Row.of(4, "ddd", "2021-01-01"); + insertRows(table, row3, row4); + + TableResult result = + exec( + "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " + + "'start-tag'='%s')*/", + TABLE, tagName); + try (CloseableIterator iterator = result.collect()) { + // the start snapshot(row2) is exclusive. + assertRows(ImmutableList.of(row3, row4), iterator); + + Row row5 = Row.of(5, "eee", "2021-01-01"); + Row row6 = Row.of(6, "fff", "2021-01-01"); + insertRows(table, row5, row6); + assertRows(ImmutableList.of(row5, row6), iterator); + + Row row7 = Row.of(7, "ggg", "2021-01-01"); + insertRows(table, row7); + assertRows(ImmutableList.of(row7), iterator); + } + result.getJobClient().ifPresent(JobClient::cancel); + + assertThatThrownBy( + () -> + exec( + "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'start-tag'='%s', " + + "'start-snapshot-id'='%d' )*/", + TABLE, tagName, startSnapshotId)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("START_SNAPSHOT_ID and START_TAG cannot both be set."); + } + + @TestTemplate + void testWithParallelismWithProps() { + int customScanParallelism = defaultJobParallelism + 1; + sql( + "CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) WITH ('scan.parallelism'='%s')", + TABLE, customScanParallelism); + + final org.apache.flink.table.api.Table table = + getTableEnv().sqlQuery(String.format("select * from %s", TABLE)); + final String explain = table.explain(ExplainDetail.JSON_EXECUTION_PLAN); + final String expectedPhysicalExecutionPlanFragment = + "\"parallelism\" : " + customScanParallelism; + assertThat(explain).contains(expectedPhysicalExecutionPlanFragment); + } + + @TestTemplate + void testWithParallelismWithHints() { + sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); + int customScanParallelism = defaultJobParallelism + 1; + + final org.apache.flink.table.api.Table table = + getTableEnv() + .sqlQuery( + String.format( + "select * from %s/*+ OPTIONS('streaming'='true', 'scan.parallelism'='%s') */", + TABLE, customScanParallelism)); + final String explain = table.explain(ExplainDetail.JSON_EXECUTION_PLAN); + final String expectedPhysicalExecutionPlanFragment = + "\"parallelism\" : " + customScanParallelism; + assertThat(explain).contains(expectedPhysicalExecutionPlanFragment); + } + + @TestTemplate + void testWithParallelismHintsOverride() { + int scanParallelismInCreateTable = defaultJobParallelism + 1; + sql( + "CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) WITH ('scan.parallelism'='%s')", + TABLE, scanParallelismInCreateTable); + + int scanParallelismInHints = defaultJobParallelism + 2; + final org.apache.flink.table.api.Table table = + getTableEnv() + .sqlQuery( + String.format( + "select * from %s/*+ OPTIONS('streaming'='true', 'scan.parallelism'='%s') */", + TABLE, scanParallelismInHints)); + final String explain = table.explain(ExplainDetail.JSON_EXECUTION_PLAN); + final String expectedPhysicalExecutionPlanFragment = + "\"parallelism\" : " + scanParallelismInHints; + assertThat(explain).contains(expectedPhysicalExecutionPlanFragment); + } +} diff --git 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java new file mode 100644 index 000000000000..3c747a05c16a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java @@ -0,0 +1,399 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.File; +import java.io.IOException; +import java.time.Duration; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.streaming.api.functions.source.legacy.SourceFunction; +import org.apache.flink.streaming.api.operators.StreamSource; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestBase; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.TestTableLoader; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.iceberg.util.ThreadPools; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestStreamingMonitorFunction extends TestBase { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; + private static final long WAIT_TIME_MILLIS = 10 * 1000L; + + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); + } + + @BeforeEach + 
@Override + public void setupTable() throws IOException { + this.metadataDir = new File(tableDir, "metadata"); + + // Construct the iceberg table. + table = create(SCHEMA, PartitionSpec.unpartitioned()); + } + + private void runSourceFunctionInTask( + TestSourceContext sourceContext, StreamingMonitorFunction function) { + Thread task = + new Thread( + () -> { + try { + function.run(sourceContext); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + task.start(); + } + + @TestTemplate + public void testConsumeWithoutStartSnapshotId() throws Exception { + List> recordsList = generateRecordsAndCommitTxn(10); + ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); + + StreamingMonitorFunction function = createFunction(scanContext); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); + runSourceFunctionInTask(sourceContext, function); + + awaitExpectedSplits(sourceContext); + + // Stop the stream task. + function.close(); + + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + } + } + + @TestTemplate + public void testConsumeFromStartSnapshotId() throws Exception { + // Commit the first five transactions. + generateRecordsAndCommitTxn(5); + long startSnapshotId = table.currentSnapshot().snapshotId(); + + // Commit the next five transactions. + List> recordsList = generateRecordsAndCommitTxn(5); + + ScanContext scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .startSnapshotId(startSnapshotId) + .build(); + + StreamingMonitorFunction function = createFunction(scanContext); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); + runSourceFunctionInTask(sourceContext, function); + + awaitExpectedSplits(sourceContext); + + // Stop the stream task. + function.close(); + + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + } + } + + @TestTemplate + public void testConsumeFromStartTag() throws Exception { + // Commit the first five transactions. + generateRecordsAndCommitTxn(5); + long startSnapshotId = table.currentSnapshot().snapshotId(); + String tagName = "t1"; + table.manageSnapshots().createTag(tagName, startSnapshotId).commit(); + + // Commit the next five transactions. + List> recordsList = generateRecordsAndCommitTxn(5); + + ScanContext scanContext = + ScanContext.builder().monitorInterval(Duration.ofMillis(100)).startTag(tagName).build(); + + StreamingMonitorFunction function = createFunction(scanContext); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); + runSourceFunctionInTask(sourceContext, function); + + awaitExpectedSplits(sourceContext); + + // Stop the stream task. 
+ function.close(); + + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + } + } + + @TestTemplate + public void testCheckpointRestore() throws Exception { + List> recordsList = generateRecordsAndCommitTxn(10); + ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); + + StreamingMonitorFunction func = createFunction(scanContext); + OperatorSubtaskState state; + try (AbstractStreamOperatorTestHarness harness = createHarness(func)) { + harness.setup(); + harness.open(); + + TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); + runSourceFunctionInTask(sourceContext, func); + + awaitExpectedSplits(sourceContext); + + state = harness.snapshot(1, 1); + + // Stop the stream task. + func.close(); + + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + } + + List> newRecordsList = generateRecordsAndCommitTxn(10); + StreamingMonitorFunction newFunc = createFunction(scanContext); + try (AbstractStreamOperatorTestHarness harness = createHarness(newFunc)) { + harness.setup(); + // Recover to process the remaining snapshots. + harness.initializeState(state); + harness.open(); + + TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); + runSourceFunctionInTask(sourceContext, newFunc); + + awaitExpectedSplits(sourceContext); + + // Stop the stream task. + newFunc.close(); + + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA); + } + } + + private void awaitExpectedSplits(TestSourceContext sourceContext) { + Awaitility.await("expected splits should be produced") + .atMost(Duration.ofMillis(WAIT_TIME_MILLIS)) + .untilAsserted( + () -> { + assertThat(sourceContext.latch.getCount()).isEqualTo(0); + assertThat(sourceContext.splits).as("Should produce the expected splits").hasSize(1); + }); + } + + @TestTemplate + public void testInvalidMaxPlanningSnapshotCount() { + ScanContext scanContext1 = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .maxPlanningSnapshotCount(0) + .build(); + + assertThatThrownBy(() -> createFunction(scanContext1)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("The max-planning-snapshot-count must be greater than zero"); + + ScanContext scanContext2 = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .maxPlanningSnapshotCount(-10) + .build(); + + assertThatThrownBy(() -> createFunction(scanContext2)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("The max-planning-snapshot-count must be greater than zero"); + } + + @TestTemplate + public void testConsumeWithMaxPlanningSnapshotCount() throws Exception { + generateRecordsAndCommitTxn(10); + + // Use the oldest snapshot as starting to avoid the initial case. 
+ long oldestSnapshotId = SnapshotUtil.oldestAncestor(table).snapshotId(); + + ScanContext scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .splitSize(1000L) + .startSnapshotId(oldestSnapshotId) + .maxPlanningSnapshotCount(Integer.MAX_VALUE) + .build(); + + FlinkInputSplit[] expectedSplits = + FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool()); + + assertThat(expectedSplits).hasSize(9); + + // This covers three cases that maxPlanningSnapshotCount is less than, equal or greater than the + // total splits number + for (int maxPlanningSnapshotCount : ImmutableList.of(1, 9, 15)) { + scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(500)) + .startSnapshotId(oldestSnapshotId) + .splitSize(1000L) + .maxPlanningSnapshotCount(maxPlanningSnapshotCount) + .build(); + + StreamingMonitorFunction function = createFunction(scanContext); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + CountDownLatch latch = new CountDownLatch(1); + TestSourceContext sourceContext = new TestSourceContext(latch); + function.sourceContext(sourceContext); + function.monitorAndForwardSplits(); + + if (maxPlanningSnapshotCount < 10) { + assertThat(sourceContext.splits).hasSize(maxPlanningSnapshotCount); + } + } + } + } + + private List> generateRecordsAndCommitTxn(int commitTimes) throws IOException { + List> expectedRecords = Lists.newArrayList(); + for (int i = 0; i < commitTimes; i++) { + List records = RandomGenericData.generate(SCHEMA, 100, 0L); + expectedRecords.add(records); + + // Commit those records to iceberg table. + writeRecords(records); + } + return expectedRecords; + } + + private void writeRecords(List records) throws IOException { + GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp); + appender.appendToTable(records); + } + + private StreamingMonitorFunction createFunction(ScanContext scanContext) { + return new StreamingMonitorFunction( + TestTableLoader.of(tableDir.getAbsolutePath()), scanContext); + } + + private AbstractStreamOperatorTestHarness createHarness( + StreamingMonitorFunction function) throws Exception { + StreamSource streamSource = + new StreamSource<>(function); + return new AbstractStreamOperatorTestHarness<>(streamSource, 1, 1, 0); + } + + private class TestSourceContext implements SourceFunction.SourceContext { + private final List splits = Lists.newArrayList(); + private final Object checkpointLock = new Object(); + private final CountDownLatch latch; + + TestSourceContext(CountDownLatch latch) { + this.latch = latch; + } + + @Override + public void collect(FlinkInputSplit element) { + splits.add(element); + latch.countDown(); + } + + @Override + public void collectWithTimestamp(FlinkInputSplit element, long timestamp) { + collect(element); + } + + @Override + public void emitWatermark(Watermark mark) {} + + @Override + public void markAsTemporarilyIdle() {} + + @Override + public Object getCheckpointLock() { + return checkpointLock; + } + + @Override + public void close() {} + + private List toRows() throws IOException { + FlinkInputFormat format = + FlinkSource.forRowData() + .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) + .buildFormat(); + + List rows = Lists.newArrayList(); + for (FlinkInputSplit split : splits) { + format.open(split); + + RowData element = null; + try { + while (!format.reachedEnd()) { + element = format.nextRecord(element); + 
rows.add(Row.of(element.getInt(0), element.getString(1).toString())); + } + } finally { + format.close(); + } + } + + return rows; + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java new file mode 100644 index 000000000000..56965417a54a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java @@ -0,0 +1,288 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor; +import org.apache.flink.streaming.runtime.tasks.mailbox.MailboxDefaultAction; +import org.apache.flink.streaming.runtime.tasks.mailbox.SteppingMailboxProcessor; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestBase; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.TestTableLoader; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.iceberg.util.ThreadPools; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestStreamingReaderOperator extends TestBase { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; + + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); + } + + @BeforeEach + @Override + 
public void setupTable() throws IOException {
+    this.metadataDir = new File(tableDir, "metadata");
+
+    // Construct the iceberg table.
+    table = create(SCHEMA, PartitionSpec.unpartitioned());
+  }
+
+  @TestTemplate
+  public void testProcessAllRecords() throws Exception {
+    List<List<Record>> expectedRecords = generateRecordsAndCommitTxn(10);
+
+    List<FlinkInputSplit> splits = generateSplits();
+    assertThat(splits).hasSize(10);
+
+    try (OneInputStreamOperatorTestHarness<FlinkInputSplit, RowData> harness = createReader()) {
+      harness.setup();
+      harness.open();
+
+      SteppingMailboxProcessor processor = createLocalMailbox(harness);
+
+      List<Row> expected = Lists.newArrayList();
+      for (int i = 0; i < splits.size(); i++) {
+        // Process this element to enqueue it in the mailbox.
+        harness.processElement(splits.get(i), -1);
+
+        // Run the mailbox once to read all records from the given split.
+        assertThat(processor.runMailboxStep()).as("Should have processed 1 split").isTrue();
+
+        // Assert the output has the expected elements.
+        expected.addAll(expectedRecords.get(i));
+        TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA);
+      }
+    }
+  }
+
+  @TestTemplate
+  public void testTriggerCheckpoint() throws Exception {
+    // Emit three splits: split1, split2, split3. The checkpoint request is triggered while
+    // records are being read from split1.
+    List<List<Record>> expectedRecords = generateRecordsAndCommitTxn(3);
+
+    List<FlinkInputSplit> splits = generateSplits();
+    assertThat(splits).hasSize(3);
+
+    long timestamp = 0;
+    try (OneInputStreamOperatorTestHarness<FlinkInputSplit, RowData> harness = createReader()) {
+      harness.setup();
+      harness.open();
+
+      SteppingMailboxProcessor processor = createLocalMailbox(harness);
+
+      harness.processElement(splits.get(0), ++timestamp);
+      harness.processElement(splits.get(1), ++timestamp);
+      harness.processElement(splits.get(2), ++timestamp);
+
+      // Trigger the snapshot state; it will start to work once all records from split0 have been read.
+      processor.getMainMailboxExecutor().execute(() -> harness.snapshot(1, 3), "Trigger snapshot");
+
+      assertThat(processor.runMailboxStep()).as("Should have processed the split0").isTrue();
+      assertThat(processor.runMailboxStep())
+          .as("Should have processed the snapshot state action")
+          .isTrue();
+
+      TestHelpers.assertRecords(readOutputValues(harness), expectedRecords.get(0), SCHEMA);
+
+      // Read records from split1.
+      assertThat(processor.runMailboxStep()).as("Should have processed the split1").isTrue();
+
+      // Read records from split2.
+      assertThat(processor.runMailboxStep()).as("Should have processed the split2").isTrue();
+
+      TestHelpers.assertRecords(
+          readOutputValues(harness), Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA);
+    }
+  }
+
+  @TestTemplate
+  public void testCheckpointRestore() throws Exception {
+    List<List<Record>> expectedRecords = generateRecordsAndCommitTxn(15);
+
+    List<FlinkInputSplit> splits = generateSplits();
+    assertThat(splits).hasSize(15);
+
+    OperatorSubtaskState state;
+    List<Row> expected = Lists.newArrayList();
+    try (OneInputStreamOperatorTestHarness<FlinkInputSplit, RowData> harness = createReader()) {
+      harness.setup();
+      harness.open();
+
+      // Enqueue all the splits.
+      for (FlinkInputSplit split : splits) {
+        harness.processElement(split, -1);
+      }
+
+      // Read all records from the first five splits.
+ SteppingMailboxProcessor localMailbox = createLocalMailbox(harness); + for (int i = 0; i < 5; i++) { + expected.addAll(expectedRecords.get(i)); + assertThat(localMailbox.runMailboxStep()) + .as("Should have processed the split#" + i) + .isTrue(); + + TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); + } + + // Snapshot state now, there're 10 splits left in the state. + state = harness.snapshot(1, 1); + } + + expected.clear(); + try (OneInputStreamOperatorTestHarness harness = createReader()) { + harness.setup(); + // Recover to process the remaining splits. + harness.initializeState(state); + harness.open(); + + SteppingMailboxProcessor localMailbox = createLocalMailbox(harness); + + for (int i = 5; i < 10; i++) { + expected.addAll(expectedRecords.get(i)); + assertThat(localMailbox.runMailboxStep()) + .as("Should have processed the split#" + i) + .isTrue(); + + TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); + } + + // Let's process the final 5 splits now. + for (int i = 10; i < 15; i++) { + expected.addAll(expectedRecords.get(i)); + harness.processElement(splits.get(i), 1); + + assertThat(localMailbox.runMailboxStep()) + .as("Should have processed the split#" + i) + .isTrue(); + TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); + } + } + } + + private List readOutputValues( + OneInputStreamOperatorTestHarness harness) { + List results = Lists.newArrayList(); + for (RowData rowData : harness.extractOutputValues()) { + results.add(Row.of(rowData.getInt(0), rowData.getString(1).toString())); + } + return results; + } + + private List> generateRecordsAndCommitTxn(int commitTimes) throws IOException { + List> expectedRecords = Lists.newArrayList(); + for (int i = 0; i < commitTimes; i++) { + List records = RandomGenericData.generate(SCHEMA, 100, 0L); + expectedRecords.add(records); + + // Commit those records to iceberg table. + writeRecords(records); + } + return expectedRecords; + } + + private void writeRecords(List records) throws IOException { + GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp); + appender.appendToTable(records); + } + + private List generateSplits() { + List inputSplits = Lists.newArrayList(); + + List snapshotIds = SnapshotUtil.currentAncestorIds(table); + for (int i = snapshotIds.size() - 1; i >= 0; i--) { + ScanContext scanContext; + if (i == snapshotIds.size() - 1) { + // Generate the splits from the first snapshot. + scanContext = ScanContext.builder().useSnapshotId(snapshotIds.get(i)).build(); + } else { + // Generate the splits between the previous snapshot and current snapshot. + scanContext = + ScanContext.builder() + .startSnapshotId(snapshotIds.get(i + 1)) + .endSnapshotId(snapshotIds.get(i)) + .build(); + } + + Collections.addAll( + inputSplits, + FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool())); + } + + return inputSplits; + } + + private OneInputStreamOperatorTestHarness createReader() + throws Exception { + // This input format is used to opening the emitted split. 
+ FlinkInputFormat inputFormat = + FlinkSource.forRowData() + .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) + .buildFormat(); + + OneInputStreamOperatorFactory factory = + StreamingReaderOperator.factory(inputFormat); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0); + + return harness; + } + + private SteppingMailboxProcessor createLocalMailbox( + OneInputStreamOperatorTestHarness harness) { + return new SteppingMailboxProcessor( + MailboxDefaultAction.Controller::suspendDefaultAction, + harness.getTaskMailbox(), + StreamTaskActionExecutor.IMMEDIATE); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java new file mode 100644 index 000000000000..1e612b0a2b2a --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.flink.source.assigner;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.fail;
+
+import java.nio.file.Path;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.atomic.AtomicBoolean;
+import org.apache.iceberg.flink.source.SplitHelpers;
+import org.apache.iceberg.flink.source.split.IcebergSourceSplit;
+import org.apache.iceberg.flink.source.split.IcebergSourceSplitState;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+public abstract class SplitAssignerTestBase {
+  @TempDir protected Path temporaryFolder;
+
+  @Test
+  public void testEmptyInitialization() {
+    SplitAssigner assigner = splitAssigner();
+    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
+  }
+
+  /** Test a sequence of interactions for StaticEnumerator */
+  @Test
+  public void testStaticEnumeratorSequence() throws Exception {
+    SplitAssigner assigner = splitAssigner();
+    assigner.onDiscoveredSplits(createSplits(4, 1, "1"));
+
+    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
+    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
+    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
+    assertSnapshot(assigner, 1);
+    assigner.onUnassignedSplits(createSplits(1, 1, "1"));
+    assertSnapshot(assigner, 2);
+
+    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
+    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
+    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
+    assertSnapshot(assigner, 0);
+  }
+
+  /** Test a sequence of interactions for ContinuousEnumerator */
+  @Test
+  public void testContinuousEnumeratorSequence() throws Exception {
+    SplitAssigner assigner = splitAssigner();
+    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
+
+    List<IcebergSourceSplit> splits1 = createSplits(1, 1, "1");
+    assertAvailableFuture(assigner, 1, () -> assigner.onDiscoveredSplits(splits1));
+    List<IcebergSourceSplit> splits2 = createSplits(1, 1, "1");
+    assertAvailableFuture(assigner, 1, () -> assigner.onUnassignedSplits(splits2));
+
+    assigner.onDiscoveredSplits(createSplits(2, 1, "1"));
+    assertSnapshot(assigner, 2);
+    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
+    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
+    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
+    assertSnapshot(assigner, 0);
+  }
+
+  private void assertAvailableFuture(
+      SplitAssigner assigner, int splitCount, Runnable addSplitsRunnable) {
+    // register callback
+    AtomicBoolean futureCompleted = new AtomicBoolean();
+    CompletableFuture<Void> future = assigner.isAvailable();
+    future.thenAccept(ignored -> futureCompleted.set(true));
+    // calling isAvailable again should return the same object reference
+    // note that thenAccept will return a new future.
+    // we want to assert the same instance on the assigner-returned future
+    assertThat(assigner.isAvailable()).isSameAs(future);
+
+    // now add some splits
+    addSplitsRunnable.run();
+    assertThat(futureCompleted.get()).isTrue();
+
+    for (int i = 0; i < splitCount; ++i) {
+      assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
+    }
+    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
+    assertSnapshot(assigner, 0);
+  }
+
+  protected void assertGetNext(SplitAssigner assigner, GetSplitResult.Status expectedStatus) {
+    GetSplitResult result = assigner.getNext(null);
+    assertThat(result.status()).isEqualTo(expectedStatus);
+    switch (expectedStatus) {
+      case AVAILABLE:
+        assertThat(result.split()).isNotNull();
+        break;
+      case CONSTRAINED:
+      case UNAVAILABLE:
+        assertThat(result.split()).isNull();
+        break;
+      default:
+        fail("Unknown status: %s", expectedStatus);
+    }
+  }
+
+  protected void assertSnapshot(SplitAssigner assigner, int splitCount) {
+    Collection<IcebergSourceSplitState> stateBeforeGet = assigner.state();
+    assertThat(stateBeforeGet).hasSize(splitCount);
+  }
+
+  protected List<IcebergSourceSplit> createSplits(int fileCount, int filesPerSplit, String version)
+      throws Exception {
+    return SplitHelpers.createSplitsFromTransientHadoopTable(
+        temporaryFolder, fileCount, filesPerSplit, version);
+  }
+
+  protected abstract SplitAssigner splitAssigner();
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java
new file mode 100644
index 000000000000..17e64bbf0594
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.iceberg.flink.source.assigner; + +import org.apache.iceberg.flink.source.SplitHelpers; +import org.junit.jupiter.api.Test; + +public class TestDefaultSplitAssigner extends SplitAssignerTestBase { + @Override + protected SplitAssigner splitAssigner() { + return new DefaultSplitAssigner(null); + } + + /** Test the assigner when multiple files are in a single split */ + @Test + public void testMultipleFilesInASplit() throws Exception { + SplitAssigner assigner = splitAssigner(); + assigner.onDiscoveredSplits( + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 4, 2)); + + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertSnapshot(assigner, 1); + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); + assertSnapshot(assigner, 0); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java new file mode 100644 index 000000000000..2b65977fb2f9 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.flink.source.assigner;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.util.List;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.flink.source.split.IcebergSourceSplit;
+import org.apache.iceberg.flink.source.split.SerializableComparator;
+import org.apache.iceberg.flink.source.split.SplitComparators;
+import org.apache.iceberg.util.SerializationUtil;
+import org.junit.jupiter.api.Test;
+
+public class TestFileSequenceNumberBasedSplitAssigner extends SplitAssignerTestBase {
+  @Override
+  protected SplitAssigner splitAssigner() {
+    return new OrderedSplitAssignerFactory(SplitComparators.fileSequenceNumber()).createAssigner();
+  }
+
+  /** Test the assigner when multiple files are in a single split */
+  @Test
+  public void testMultipleFilesInAnIcebergSplit() {
+    SplitAssigner assigner = splitAssigner();
+    assertThatThrownBy(
+            () -> assigner.onDiscoveredSplits(createSplits(4, 2, "2")),
+            "Multiple files in a split is not allowed")
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("Please use 'split-file-open-cost'");
+  }
+
+  /** Test sorted splits */
+  @Test
+  public void testSplitSort() throws Exception {
+    SplitAssigner assigner = splitAssigner();
+    List<IcebergSourceSplit> splits = createSplits(5, 1, "2");
+
+    assigner.onDiscoveredSplits(splits.subList(3, 5));
+    assigner.onDiscoveredSplits(splits.subList(0, 1));
+    assigner.onDiscoveredSplits(splits.subList(1, 3));
+
+    assertGetNext(assigner, 1L);
+    assertGetNext(assigner, 2L);
+    assertGetNext(assigner, 3L);
+    assertGetNext(assigner, 4L);
+    assertGetNext(assigner, 5L);
+
+    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
+  }
+
+  @Test
+  public void testSerializable() {
+    byte[] bytes = SerializationUtil.serializeToBytes(SplitComparators.fileSequenceNumber());
+    SerializableComparator<IcebergSourceSplit> comparator =
+        SerializationUtil.deserializeFromBytes(bytes);
+    assertThat(comparator).isNotNull();
+  }
+
+  private void assertGetNext(SplitAssigner assigner, Long expectedSequenceNumber) {
+    GetSplitResult result = assigner.getNext(null);
+    ContentFile<?> file = result.split().task().files().iterator().next().file();
+    assertThat(file.fileSequenceNumber()).isEqualTo(expectedSequenceNumber);
+  }
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java
new file mode 100644
index 000000000000..84f04d5a530a
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.time.temporal.ChronoUnit; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.reader.ColumnStatsWatermarkExtractor; +import org.apache.iceberg.flink.source.reader.ReaderUtil; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.SerializableComparator; +import org.apache.iceberg.flink.source.split.SplitComparators; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.SerializationUtil; +import org.junit.jupiter.api.Test; + +public class TestWatermarkBasedSplitAssigner extends SplitAssignerTestBase { + public static final Schema SCHEMA = + new Schema(required(1, "timestamp_column", Types.TimestampType.withoutZone())); + private static final GenericAppenderFactory APPENDER_FACTORY = new GenericAppenderFactory(SCHEMA); + + @Override + protected SplitAssigner splitAssigner() { + return new OrderedSplitAssignerFactory( + SplitComparators.watermark( + new ColumnStatsWatermarkExtractor(SCHEMA, "timestamp_column", null))) + .createAssigner(); + } + + /** Test the assigner when multiple files are in a single split */ + @Test + public void testMultipleFilesInAnIcebergSplit() { + SplitAssigner assigner = splitAssigner(); + assigner.onDiscoveredSplits(createSplits(4, 2, "2")); + + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); + } + + /** Test sorted splits */ + @Test + public void testSplitSort() { + SplitAssigner assigner = splitAssigner(); + + Instant now = Instant.now(); + List splits = + IntStream.range(0, 5) + .mapToObj(i -> splitFromInstant(now.plus(i, ChronoUnit.MINUTES))) + .collect(Collectors.toList()); + + assigner.onDiscoveredSplits(splits.subList(3, 5)); + assigner.onDiscoveredSplits(splits.subList(0, 1)); + assigner.onDiscoveredSplits(splits.subList(1, 3)); + + assertGetNext(assigner, splits.get(0)); + assertGetNext(assigner, splits.get(1)); + assertGetNext(assigner, splits.get(2)); + assertGetNext(assigner, splits.get(3)); + assertGetNext(assigner, splits.get(4)); + + assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); + } + + @Test + public void testSerializable() { + byte[] bytes = + SerializationUtil.serializeToBytes( + SplitComparators.watermark( + new ColumnStatsWatermarkExtractor( + TestFixtures.SCHEMA, "id", TimeUnit.MILLISECONDS))); + SerializableComparator comparator = + SerializationUtil.deserializeFromBytes(bytes); + assertThat(comparator).isNotNull(); + } + + private void assertGetNext(SplitAssigner assigner, IcebergSourceSplit split) { 
+ GetSplitResult result = assigner.getNext(null); + assertThat(split).isEqualTo(result.split()); + } + + @Override + protected List createSplits( + int fileCount, int filesPerSplit, String version) { + return IntStream.range(0, fileCount / filesPerSplit) + .mapToObj( + splitNum -> + splitFromRecords( + IntStream.range(0, filesPerSplit) + .mapToObj( + fileNum -> + RandomGenericData.generate( + SCHEMA, 2, (long) splitNum * filesPerSplit + fileNum)) + .collect(Collectors.toList()))) + .collect(Collectors.toList()); + } + + private IcebergSourceSplit splitFromInstant(Instant instant) { + Record record = GenericRecord.create(SCHEMA); + record.set(0, LocalDateTime.ofInstant(instant, ZoneOffset.UTC)); + return splitFromRecords(ImmutableList.of(ImmutableList.of(record))); + } + + private IcebergSourceSplit splitFromRecords(List> records) { + try { + return IcebergSourceSplit.fromCombinedScanTask( + ReaderUtil.createCombinedScanTask( + records, temporaryFolder, FileFormat.PARQUET, APPENDER_FACTORY)); + } catch (IOException e) { + throw new RuntimeException("Split creation exception", e); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java new file mode 100644 index 000000000000..ebc92df02360 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.flink.source.enumerator;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.NavigableMap;
+import java.util.TreeMap;
+import org.apache.iceberg.flink.source.ScanContext;
+import org.apache.iceberg.flink.source.split.IcebergSourceSplit;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+
+class ManualContinuousSplitPlanner implements ContinuousSplitPlanner {
+  private final int maxPlanningSnapshotCount;
+  // track splits per snapshot
+  private final NavigableMap<Long, List<IcebergSourceSplit>> splits;
+  private long latestSnapshotId;
+  private int remainingFailures;
+
+  ManualContinuousSplitPlanner(ScanContext scanContext, int expectedFailures) {
+    this.maxPlanningSnapshotCount = scanContext.maxPlanningSnapshotCount();
+    this.splits = new TreeMap<>();
+    this.latestSnapshotId = 0L;
+    this.remainingFailures = expectedFailures;
+  }
+
+  @Override
+  public synchronized ContinuousEnumerationResult planSplits(
+      IcebergEnumeratorPosition lastPosition) {
+    if (remainingFailures > 0) {
+      remainingFailures--;
+      throw new RuntimeException("Expected failure at planning");
+    }
+
+    long fromSnapshotIdExclusive = 0;
+    if (lastPosition != null && lastPosition.snapshotId() != null) {
+      fromSnapshotIdExclusive = lastPosition.snapshotId();
+    }
+
+    Preconditions.checkArgument(
+        fromSnapshotIdExclusive <= latestSnapshotId,
+        "last enumerated snapshotId is greater than the latestSnapshotId");
+    if (fromSnapshotIdExclusive == latestSnapshotId) {
+      // already discovered everything.
+      return new ContinuousEnumerationResult(Lists.newArrayList(), lastPosition, lastPosition);
+    }
+
+    // find the subset of snapshots for which to return discovered splits
+    long toSnapshotIdInclusive;
+    if (latestSnapshotId - fromSnapshotIdExclusive > maxPlanningSnapshotCount) {
+      toSnapshotIdInclusive = fromSnapshotIdExclusive + maxPlanningSnapshotCount;
+    } else {
+      toSnapshotIdInclusive = latestSnapshotId;
+    }
+
+    List<IcebergSourceSplit> discoveredSplits = Lists.newArrayList();
+    NavigableMap<Long, List<IcebergSourceSplit>> discoveredView =
+        splits.subMap(fromSnapshotIdExclusive, false, toSnapshotIdInclusive, true);
+    discoveredView.forEach((snapshotId, snapshotSplits) -> discoveredSplits.addAll(snapshotSplits));
+    ContinuousEnumerationResult result =
+        new ContinuousEnumerationResult(
+            discoveredSplits,
+            lastPosition,
+            // use the snapshot id as the snapshot timestamp.
+            IcebergEnumeratorPosition.of(toSnapshotIdInclusive, toSnapshotIdInclusive));
+    return result;
+  }
+
+  /**
+   * Add a collection of new splits. A monotonically increasing snapshotId is assigned to each
+   * batch of splits added by this method.
+   */
+  public synchronized void addSplits(List<IcebergSourceSplit> newSplits) {
+    latestSnapshotId += 1;
+    splits.put(latestSnapshotId, newSplits);
+  }
+
+  @Override
+  public void close() throws IOException {}
+}
diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java
new file mode 100644
index 000000000000..41a787762fda
--- /dev/null
+++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java
@@ -0,0 +1,352 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.connector.testutils.source.reader.TestingSplitEnumeratorContext; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.SplitHelpers; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; +import org.apache.iceberg.flink.source.assigner.DefaultSplitAssigner; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; +import org.apache.iceberg.flink.source.split.SplitRequestEvent; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestContinuousIcebergEnumerator { + @TempDir protected Path temporaryFolder; + + @Test + public void testDiscoverSplitWhenNoReaderRegistered() throws Exception { + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + Collection pendingSplitsEmpty = + enumerator.snapshotState(1).pendingSplits(); + assertThat(pendingSplitsEmpty).isEmpty(); + + // make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + splitPlanner.addSplits(splits); + enumeratorContext.triggerAllActions(); + + Collection pendingSplits = enumerator.snapshotState(2).pendingSplits(); + assertThat(pendingSplits).hasSize(1); + IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); + assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); + assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); + } + + @Test + public void testDiscoverWhenReaderRegistered() throws Exception { + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); + ContinuousIcebergEnumerator enumerator = + 
createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // register one reader, and let it request a split + enumeratorContext.registerReader(2, "localhost"); + enumerator.addReader(2); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); + + // make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + splitPlanner.addSplits(splits); + enumeratorContext.triggerAllActions(); + + assertThat(enumerator.snapshotState(1).pendingSplits()).isEmpty(); + assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) + .contains(splits.get(0)); + } + + @Test + public void testRequestingReaderUnavailableWhenSplitDiscovered() throws Exception { + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // register one reader, and let it request a split + enumeratorContext.registerReader(2, "localhost"); + enumerator.addReader(2); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); + + // remove the reader (like in a failure) + enumeratorContext.registeredReaders().remove(2); + + // make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + assertThat(splits).hasSize(1); + splitPlanner.addSplits(splits); + enumeratorContext.triggerAllActions(); + + assertThat(enumeratorContext.getSplitAssignments()).doesNotContainKey(2); + List pendingSplitIds = + enumerator.snapshotState(1).pendingSplits().stream() + .map(IcebergSourceSplitState::split) + .map(IcebergSourceSplit::splitId) + .collect(Collectors.toList()); + assertThat(pendingSplitIds).hasSameSizeAs(splits).first().isEqualTo(splits.get(0).splitId()); + + // register the reader again, and let it request a split + enumeratorContext.registerReader(2, "localhost"); + enumerator.addReader(2); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); + + assertThat(enumerator.snapshotState(2).pendingSplits()).isEmpty(); + assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) + .contains(splits.get(0)); + } + + @Test + public void testThrottlingDiscovery() throws Exception { + // create 10 splits + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 10, 1); + + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + // discover one snapshot at a time + .maxPlanningSnapshotCount(1) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // register reader-2, and let it request a split + enumeratorContext.registerReader(2, "localhost"); + enumerator.addReader(2); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); + + // add splits[0] to the planner for next discovery + splitPlanner.addSplits(Arrays.asList(splits.get(0))); + 
enumeratorContext.triggerAllActions();
+
+    // because the discovered split was assigned to the reader, pending splits should be empty
+    assertThat(enumerator.snapshotState(1).pendingSplits()).isEmpty();
+    // split assignment to reader-2 should contain splits[0, 1)
+    assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits())
+        .containsExactlyElementsOf(splits.subList(0, 1));
+
+    // add the remaining 9 splits (one for every snapshot)
+    // run discovery cycles while reader-2 is still processing splits[0]
+    for (int i = 1; i < 10; ++i) {
+      splitPlanner.addSplits(Arrays.asList(splits.get(i)));
+      enumeratorContext.triggerAllActions();
+    }
+
+    // can only discover up to 3 snapshots/splits
+    assertThat(enumerator.snapshotState(2).pendingSplits()).hasSize(3);
+    // split assignment to reader-2 should be splits[0, 1)
+    assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits())
+        .containsExactlyElementsOf(splits.subList(0, 1));
+
+    // now reader-2 has finished splits[0]
+    enumerator.handleSourceEvent(2, new SplitRequestEvent(Arrays.asList(splits.get(0).splitId())));
+    enumeratorContext.triggerAllActions();
+    // still 3 pending splits: after splits[1] was assigned to reader-2, one more split was
+    // discovered and added.
+    assertThat(enumerator.snapshotState(3).pendingSplits()).hasSize(3);
+    // split assignment to reader-2 should be splits[0, 2)
+    assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits())
+        .containsExactlyElementsOf(splits.subList(0, 2));
+
+    // run 3 more split discovery cycles
+    for (int i = 0; i < 3; ++i) {
+      enumeratorContext.triggerAllActions();
+    }
+
+    // no more splits are discovered due to throttling
+    assertThat(enumerator.snapshotState(4).pendingSplits()).hasSize(3);
+    // split assignment to reader-2 should still be splits[0, 2)
+    assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits())
+        .containsExactlyElementsOf(splits.subList(0, 2));
+
+    // now reader-2 has finished splits[1]
+    enumerator.handleSourceEvent(2, new SplitRequestEvent(Arrays.asList(splits.get(1).splitId())));
+    enumeratorContext.triggerAllActions();
+    // still 3 pending splits: after splits[2] was assigned to reader-2, one more split was
+    // discovered and added.
+ assertThat(enumerator.snapshotState(5).pendingSplits()).hasSize(3); + // split assignment to reader-2 should be splits[0, 3) + assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) + .containsExactlyElementsOf(splits.subList(0, 3)); + } + + @Test + public void testTransientPlanningErrorsWithSuccessfulRetry() throws Exception { + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .maxPlanningSnapshotCount(1) + .maxAllowedPlanningFailures(2) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 1); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // Make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + splitPlanner.addSplits(splits); + + // Trigger a planning and check that no splits returned due to the planning error + enumeratorContext.triggerAllActions(); + assertThat(enumerator.snapshotState(2).pendingSplits()).isEmpty(); + + // Second scan planning should succeed and discover the expected splits + enumeratorContext.triggerAllActions(); + Collection pendingSplits = enumerator.snapshotState(3).pendingSplits(); + assertThat(pendingSplits).hasSize(1); + IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); + assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); + assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); + } + + @Test + public void testOverMaxAllowedPlanningErrors() throws Exception { + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .maxPlanningSnapshotCount(1) + .maxAllowedPlanningFailures(1) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 2); + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // Make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + splitPlanner.addSplits(splits); + + // Check that the scheduler response ignores the current error and continues to run until the + // failure limit is reached + enumeratorContext.triggerAllActions(); + assertThat(enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).isDone()) + .isFalse(); + + // Check that the task has failed with the expected exception after the failure limit is reached + enumeratorContext.triggerAllActions(); + assertThat(enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).isDone()) + .isTrue(); + assertThatThrownBy( + () -> enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).get()) + .hasCauseInstanceOf(RuntimeException.class) + .hasMessageContaining("Failed to discover new split"); + } + + @Test + public void testPlanningIgnoringErrors() throws Exception { + int expectedFailures = 3; + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + 
.startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .maxPlanningSnapshotCount(1) + .maxAllowedPlanningFailures(-1) + .build(); + ManualContinuousSplitPlanner splitPlanner = + new ManualContinuousSplitPlanner(scanContext, expectedFailures); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // Make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + splitPlanner.addSplits(splits); + + Collection pendingSplits; + // Can not discover the new split with planning failures + for (int i = 0; i < expectedFailures; ++i) { + enumeratorContext.triggerAllActions(); + pendingSplits = enumerator.snapshotState(i).pendingSplits(); + assertThat(pendingSplits).isEmpty(); + } + + // Discovered the new split after a successful scan planning + enumeratorContext.triggerAllActions(); + pendingSplits = enumerator.snapshotState(expectedFailures + 1).pendingSplits(); + assertThat(pendingSplits).hasSize(1); + IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); + assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); + assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); + } + + private static ContinuousIcebergEnumerator createEnumerator( + SplitEnumeratorContext context, + ScanContext scanContext, + ContinuousSplitPlanner splitPlanner) { + + ContinuousIcebergEnumerator enumerator = + new ContinuousIcebergEnumerator( + context, + new DefaultSplitAssigner(null, Collections.emptyList()), + scanContext, + splitPlanner, + null); + enumerator.start(); + return enumerator; + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java new file mode 100644 index 000000000000..9a4bfa03e28b --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java @@ -0,0 +1,734 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopTableExtension; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestContinuousSplitPlannerImpl { + @TempDir protected Path temporaryFolder; + + private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; + private static final AtomicLong RANDOM_SEED = new AtomicLong(); + + @RegisterExtension + private static final HadoopTableExtension TABLE_RESOURCE = + new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + + private GenericAppenderHelper dataAppender; + private DataFile dataFile1; + private Snapshot snapshot1; + private DataFile dataFile2; + private Snapshot snapshot2; + + @BeforeEach + public void before() throws IOException { + dataAppender = new GenericAppenderHelper(TABLE_RESOURCE.table(), FILE_FORMAT, temporaryFolder); + } + + private void appendTwoSnapshots() throws IOException { + // snapshot1 + List batch1 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + dataFile1 = dataAppender.writeFile(null, batch1); + dataAppender.appendToTable(dataFile1); + snapshot1 = TABLE_RESOURCE.table().currentSnapshot(); + + // snapshot2 + List batch2 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 1L); + dataFile2 = dataAppender.writeFile(null, batch2); + dataAppender.appendToTable(dataFile2); + snapshot2 = TABLE_RESOURCE.table().currentSnapshot(); + } + + /** + * @return the last enumerated snapshot id + */ + private CycleResult verifyOneCycle( + ContinuousSplitPlannerImpl splitPlanner, IcebergEnumeratorPosition lastPosition) + throws Exception { + List batch = + RandomGenericData.generate(TestFixtures.SCHEMA, 2, RANDOM_SEED.incrementAndGet()); + DataFile dataFile = dataAppender.writeFile(null, batch); + dataAppender.appendToTable(dataFile); + Snapshot snapshot = TABLE_RESOURCE.table().currentSnapshot(); + + ContinuousEnumerationResult result = splitPlanner.planSplits(lastPosition); + assertThat(result.fromPosition().snapshotId()).isEqualTo(lastPosition.snapshotId()); + assertThat(result.fromPosition().snapshotTimestampMs()) + .isEqualTo(lastPosition.snapshotTimestampMs()); + assertThat(result.toPosition().snapshotId().longValue()).isEqualTo(snapshot.snapshotId()); + assertThat(result.toPosition().snapshotTimestampMs().longValue()) + 
.isEqualTo(snapshot.timestampMillis()); + assertThat(result.splits()).hasSize(1); + IcebergSourceSplit split = Iterables.getOnlyElement(result.splits()); + assertThat(split.task().files()) + .hasSize(1) + .first() + .satisfies( + fileScanTask -> + assertThat(fileScanTask.file().location()).isEqualTo(dataFile.location())); + return new CycleResult(result.toPosition(), split); + } + + @Test + public void testTableScanThenIncrementalWithEmptyTable() throws Exception { + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); + assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().isEmpty()).isTrue(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); + assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()).isTrue(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().isEmpty()).isTrue(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + // next 3 snapshots + IcebergEnumeratorPosition lastPosition = emptyTableSecondDiscoveryResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testTableScanThenIncrementalWithNonEmptyTable() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + assertThat(initialResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + assertThat(initialResult.splits()).hasSize(1); + IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); + assertThat(split.task().files()).hasSize(2); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().location()) + .collect(Collectors.toSet()); + Set expectedFiles = ImmutableSet.of(dataFile1.location(), dataFile2.location()); + assertThat(discoveredFiles).containsExactlyInAnyOrderElementsOf(expectedFiles); + + IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @ParameterizedTest + @EnumSource( + value = StreamingStartingStrategy.class, + names = {"INCREMENTAL_FROM_LATEST_SNAPSHOT", "INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE"}) + public void 
testIncrementalFromLatestSnapshotWithEmptyTable( + StreamingStartingStrategy startingStrategy) throws Exception { + ScanContext scanContext = + ScanContext.builder().startingStrategy(startingStrategy).splitSize(1L).build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); + assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().isEmpty()).isTrue(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); + assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()).isTrue(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().isEmpty()).isTrue(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + // latest mode should discover both snapshots, as latest position is marked by when job starts + appendTwoSnapshots(); + ContinuousEnumerationResult afterTwoSnapshotsAppended = + splitPlanner.planSplits(emptyTableSecondDiscoveryResult.toPosition()); + assertThat(afterTwoSnapshotsAppended.splits()).hasSize(2); + + // next 3 snapshots + IcebergEnumeratorPosition lastPosition = afterTwoSnapshotsAppended.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromLatestSnapshotWithNonEmptyTable() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + // For inclusive behavior, the initial result should point to snapshot1 + // Then the next incremental scan shall discover files from latest snapshot2 (inclusive) + assertThat(initialResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot1.snapshotId()); + assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(initialResult.splits()).isEmpty(); + + ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); + assertThat(secondResult.fromPosition().snapshotId().longValue()) + .isEqualTo(snapshot1.snapshotId()); + assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(secondResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); + assertThat(split.task().files()).hasSize(1); + Set discoveredFiles = + split.task().files().stream() + 
.map(fileScanTask -> fileScanTask.file().location()) + .collect(Collectors.toSet()); + // should discover dataFile2 appended in snapshot2 + Set expectedFiles = ImmutableSet.of(dataFile2.location()); + assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); + + IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromLatestSnapshotExclusiveWithNonEmptyTable() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.splits()).isEmpty(); + assertThat(initialResult.fromPosition()).isNull(); + // For exclusive behavior, the initial result should point to snapshot2 + assertThat(initialResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + + // Then the next incremental scan shall discover no files + ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); + assertThat(initialResult.splits()).isEmpty(); + assertThat(secondResult.fromPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + assertThat(secondResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + + IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromEarliestSnapshotWithEmptyTable() throws Exception { + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); + assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotId()).isNull(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); + assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotId()).isNull(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotId()).isNull(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + // next 3 
snapshots + IcebergEnumeratorPosition lastPosition = emptyTableSecondDiscoveryResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromEarliestSnapshotWithNonEmptyTable() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + // For inclusive behavior, the initial result should point to snapshot1's parent, + // which leads to null snapshotId and snapshotTimestampMs. + assertThat(initialResult.toPosition().snapshotId()).isNull(); + assertThat(initialResult.toPosition().snapshotTimestampMs()).isNull(); + assertThat(initialResult.splits()).isEmpty(); + + ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); + assertThat(secondResult.fromPosition().snapshotId()).isNull(); + assertThat(secondResult.fromPosition().snapshotTimestampMs()).isNull(); + assertThat(secondResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); + assertThat(split.task().files()).hasSize(2); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().location()) + .collect(Collectors.toSet()); + // should discover files appended in both snapshot1 and snapshot2 + Set expectedFiles = ImmutableSet.of(dataFile1.location(), dataFile2.location()); + assertThat(discoveredFiles).containsExactlyInAnyOrderElementsOf(expectedFiles); + + IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromSnapshotIdWithEmptyTable() { + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(1L) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); + + assertThatThrownBy(() -> splitPlanner.planSplits(null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Start snapshot id not found in history: 1"); + } + + @Test + public void testIncrementalFromSnapshotIdWithInvalidIds() throws Exception { + appendTwoSnapshots(); + + // find an invalid snapshotId + long invalidSnapshotId = 0L; + while (invalidSnapshotId == snapshot1.snapshotId() + || invalidSnapshotId == snapshot2.snapshotId()) { + invalidSnapshotId++; + } + + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(invalidSnapshotId) + .build(); + + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); + + assertThatThrownBy(() -> splitPlanner.planSplits(null)) + 
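+ // Planning fails fast: a configured start snapshot id that is not part of the table
+ // history surfaces as an IllegalArgumentException from the very first planSplits() call.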
.isInstanceOf(IllegalArgumentException.class) + .hasMessage("Start snapshot id not found in history: " + invalidSnapshotId); + } + + @Test + public void testIncrementalFromSnapshotId() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot2.snapshotId()) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + // For inclusive behavior of snapshot2, the initial result should point to snapshot1 (as + // snapshot2's parent) + assertThat(initialResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot1.snapshotId()); + assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(initialResult.splits()).isEmpty(); + + ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); + assertThat(secondResult.fromPosition().snapshotId().longValue()) + .isEqualTo(snapshot1.snapshotId()); + assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(secondResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); + assertThat(split.task().files()).hasSize(1); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().location()) + .collect(Collectors.toSet()); + // should discover dataFile2 appended in snapshot2 + Set expectedFiles = ImmutableSet.of(dataFile2.location()); + assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); + + IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromSnapshotTimestampWithEmptyTable() { + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(1L) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); + + assertThatThrownBy(() -> splitPlanner.planSplits(null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot find a snapshot after: 1"); + } + + @Test + public void testIncrementalFromSnapshotTimestampWithInvalidIds() throws Exception { + appendTwoSnapshots(); + + long invalidSnapshotTimestampMs = snapshot2.timestampMillis() + 1000L; + + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(invalidSnapshotTimestampMs) + .build(); + + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); + + assertThatThrownBy(() -> splitPlanner.planSplits(null)) + .isInstanceOf(IllegalArgumentException.class) + 
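+ // A start timestamp newer than the latest snapshot cannot be resolved to any snapshot,
+ // so the first planning attempt is rejected with the error asserted below.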
.hasMessageStartingWith("Cannot find a snapshot after:"); + } + + @Test + public void testIncrementalFromSnapshotTimestamp() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot2.timestampMillis()) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + // For inclusive behavior, the initial result should point to snapshot1 (as snapshot2's parent). + assertThat(initialResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot1.snapshotId()); + assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(initialResult.splits()).isEmpty(); + + ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); + assertThat(secondResult.fromPosition().snapshotId().longValue()) + .isEqualTo(snapshot1.snapshotId()); + assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(secondResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); + assertThat(split.task().files()).hasSize(1); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().location()) + .collect(Collectors.toSet()); + // should discover dataFile2 appended in snapshot2 + Set expectedFiles = ImmutableSet.of(dataFile2.location()); + assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); + + IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testMaxPlanningSnapshotCount() throws Exception { + appendTwoSnapshots(); + // append 3 more snapshots + for (int i = 2; i < 5; ++i) { + appendSnapshot(i, 2); + } + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + // limit to 1 snapshot per discovery + .maxPlanningSnapshotCount(1) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + // For inclusive behavior, the initial result should point to snapshot1's parent, + // which leads to null snapshotId and snapshotTimestampMs. 
+ assertThat(initialResult.toPosition().snapshotId()).isNull(); + assertThat(initialResult.toPosition().snapshotTimestampMs()).isNull(); + assertThat(initialResult.splits()).isEmpty(); + + ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); + // should discover dataFile1 appended in snapshot1 + verifyMaxPlanningSnapshotCountResult( + secondResult, null, snapshot1, ImmutableSet.of(dataFile1.location())); + + ContinuousEnumerationResult thirdResult = splitPlanner.planSplits(secondResult.toPosition()); + // should discover dataFile2 appended in snapshot2 + verifyMaxPlanningSnapshotCountResult( + thirdResult, snapshot1, snapshot2, ImmutableSet.of(dataFile2.location())); + } + + @Test + public void testTableScanNoStats() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .includeColumnStats(false) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.splits()).hasSize(1); + IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); + assertThat(split.task().files()).hasSize(2); + verifyStatCount(split, 0); + + IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); + for (int i = 0; i < 3; ++i) { + CycleResult result = verifyOneCycle(splitPlanner, lastPosition); + verifyStatCount(result.split, 0); + lastPosition = result.lastPosition; + } + } + + @Test + public void testTableScanAllStats() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .includeColumnStats(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.splits()).hasSize(1); + IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); + assertThat(split.task().files()).hasSize(2); + verifyStatCount(split, 3); + + IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); + for (int i = 0; i < 3; ++i) { + CycleResult result = verifyOneCycle(splitPlanner, lastPosition); + verifyStatCount(result.split, 3); + lastPosition = result.lastPosition; + } + } + + @Test + public void testTableScanSingleStat() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .includeColumnStats(ImmutableSet.of("data")) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.splits()).hasSize(1); + IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); + assertThat(split.task().files()).hasSize(2); + verifyStatCount(split, 1); + + IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); + for (int i = 0; i < 3; ++i) { + CycleResult result = verifyOneCycle(splitPlanner, lastPosition); + verifyStatCount(result.split, 1); + lastPosition = result.lastPosition; + } + } + + private 
void verifyStatCount(IcebergSourceSplit split, int expected) { + if (expected == 0) { + split + .task() + .files() + .forEach( + f -> { + assertThat(f.file().valueCounts()).isNull(); + assertThat(f.file().columnSizes()).isNull(); + assertThat(f.file().lowerBounds()).isNull(); + assertThat(f.file().upperBounds()).isNull(); + assertThat(f.file().nanValueCounts()).isNull(); + assertThat(f.file().nullValueCounts()).isNull(); + }); + } else { + split + .task() + .files() + .forEach( + f -> { + assertThat(f.file().valueCounts()).hasSize(expected); + assertThat(f.file().columnSizes()).hasSize(expected); + assertThat(f.file().lowerBounds()).hasSize(expected); + assertThat(f.file().upperBounds()).hasSize(expected); + assertThat(f.file().nullValueCounts()).hasSize(expected); + // The nanValue is not counted for long and string fields + assertThat(f.file().nanValueCounts()).isEmpty(); + }); + } + } + + private void verifyMaxPlanningSnapshotCountResult( + ContinuousEnumerationResult result, + Snapshot fromSnapshotExclusive, + Snapshot toSnapshotInclusive, + Set expectedFiles) { + if (fromSnapshotExclusive == null) { + assertThat(result.fromPosition().snapshotId()).isNull(); + assertThat(result.fromPosition().snapshotTimestampMs()).isNull(); + } else { + assertThat(result.fromPosition().snapshotId().longValue()) + .isEqualTo(fromSnapshotExclusive.snapshotId()); + assertThat(result.fromPosition().snapshotTimestampMs().longValue()) + .isEqualTo(fromSnapshotExclusive.timestampMillis()); + } + assertThat(result.toPosition().snapshotId().longValue()) + .isEqualTo(toSnapshotInclusive.snapshotId()); + assertThat(result.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(toSnapshotInclusive.timestampMillis()); + // should only have one split with one data file, because split discover is limited to + // one snapshot and each snapshot has only one data file appended. + IcebergSourceSplit split = Iterables.getOnlyElement(result.splits()); + assertThat(split.task().files()).hasSize(1); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().location()) + .collect(Collectors.toSet()); + assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); + } + + private Snapshot appendSnapshot(long seed, int numRecords) throws Exception { + List batch = RandomGenericData.generate(TestFixtures.SCHEMA, numRecords, seed); + DataFile dataFile = dataAppender.writeFile(null, batch); + dataAppender.appendToTable(dataFile); + return TABLE_RESOURCE.table().currentSnapshot(); + } + + private static class CycleResult { + IcebergEnumeratorPosition lastPosition; + IcebergSourceSplit split; + + CycleResult(IcebergEnumeratorPosition lastPosition, IcebergSourceSplit split) { + this.lastPosition = lastPosition; + this.split = split; + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java new file mode 100644 index 000000000000..9b59e85d2afb --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopTableExtension; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestContinuousSplitPlannerImplStartStrategy { + private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; + + @TempDir protected Path temporaryFolder; + + @RegisterExtension + private static final HadoopTableExtension TABLE_RESOURCE = + new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + + private GenericAppenderHelper dataAppender; + private Snapshot snapshot1; + private Snapshot snapshot2; + private Snapshot snapshot3; + + @BeforeEach + public void before() throws IOException { + dataAppender = new GenericAppenderHelper(TABLE_RESOURCE.table(), FILE_FORMAT, temporaryFolder); + } + + private void appendThreeSnapshots() throws IOException { + List batch1 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + dataAppender.appendToTable(batch1); + snapshot1 = TABLE_RESOURCE.table().currentSnapshot(); + + List batch2 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 1L); + dataAppender.appendToTable(batch2); + snapshot2 = TABLE_RESOURCE.table().currentSnapshot(); + + List batch3 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 2L); + dataAppender.appendToTable(batch3); + snapshot3 = TABLE_RESOURCE.table().currentSnapshot(); + } + + @Test + public void testTableScanThenIncrementalStrategy() throws IOException { + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + + assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) + .isNotPresent(); + + appendThreeSnapshots(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot3.snapshotId()); + } + + @ParameterizedTest + @EnumSource( + value = StreamingStartingStrategy.class, + names = {"INCREMENTAL_FROM_LATEST_SNAPSHOT", 
"INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE"}) + public void testForLatestSnapshotStrategyWithEmptyTable( + StreamingStartingStrategy startingStrategy) throws IOException { + ScanContext scanContext = + ScanContext.builder().streaming(true).startingStrategy(startingStrategy).build(); + + assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) + .isNotPresent(); + + appendThreeSnapshots(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot3.snapshotId()); + } + + @ParameterizedTest + @EnumSource( + value = StreamingStartingStrategy.class, + names = {"INCREMENTAL_FROM_LATEST_SNAPSHOT", "INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE"}) + public void testForLatestSnapshotStrategyWithNonEmptyTable( + StreamingStartingStrategy startingStrategy) throws IOException { + appendThreeSnapshots(); + + ScanContext scanContext = + ScanContext.builder().streaming(true).startingStrategy(startingStrategy).build(); + + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot3.snapshotId()); + } + + @Test + public void testForEarliestSnapshotStrategy() throws IOException { + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + + assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) + .isNotPresent(); + + appendThreeSnapshots(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot1.snapshotId()); + } + + @Test + public void testForSpecificSnapshotIdStrategy() throws IOException { + ScanContext scanContextInvalidSnapshotId = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(1L) + .build(); + + assertThatThrownBy( + () -> + ContinuousSplitPlannerImpl.startSnapshot( + TABLE_RESOURCE.table(), scanContextInvalidSnapshotId)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Start snapshot id not found in history: 1"); + + appendThreeSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot2.snapshotId()) + .build(); + + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); + } + + @Test + public void testForSpecificSnapshotTimestampStrategySnapshot2() throws IOException { + ScanContext scanContextInvalidSnapshotTimestamp = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(1L) + .build(); + + assertThatThrownBy( + () -> + ContinuousSplitPlannerImpl.startSnapshot( + TABLE_RESOURCE.table(), scanContextInvalidSnapshotTimestamp)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot find a snapshot after: "); + + appendThreeSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + 
.startSnapshotTimestamp(snapshot2.timestampMillis()) + .build(); + + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); + } + + @Test + public void testForSpecificSnapshotTimestampStrategySnapshot2Minus1() throws IOException { + appendThreeSnapshots(); + + ScanContext config = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot2.timestampMillis() - 1L) + .build(); + + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), config).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java new file mode 100644 index 000000000000..feefcb98646b --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.jupiter.api.Test; + +public class TestEnumerationHistory { + private static final int MAX_HISTORY_SIZE = 3; + private static final int FEW_PENDING_SPLITS = 2; + private static final int TOO_MANY_PENDING_SPLITS = 100; + + @Test + public void testEmptyHistory() { + EnumerationHistory history = new EnumerationHistory(MAX_HISTORY_SIZE); + int[] expectedHistorySnapshot = new int[0]; + testHistory(history, expectedHistorySnapshot); + } + + @Test + public void testNotFullHistory() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + int[] expectedHistorySnapshot = {1, 2}; + testHistory(history, expectedHistorySnapshot); + } + + @Test + public void testExactFullHistory() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + history.add(3); + int[] expectedHistorySnapshot = {1, 2, 3}; + testHistory(history, expectedHistorySnapshot); + } + + @Test + public void testOneMoreThanFullHistory() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + history.add(3); + history.add(4); + int[] expectedHistorySnapshot = {2, 3, 4}; + testHistory(history, expectedHistorySnapshot); + } + + @Test + public void testTwoMoreThanFullHistory() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + history.add(3); + history.add(4); + history.add(5); + int[] expectedHistorySnapshot = {3, 4, 5}; + testHistory(history, expectedHistorySnapshot); + } + + @Test + public void testThreeMoreThanFullHistory() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + history.add(3); + history.add(4); + history.add(5); + history.add(6); + int[] expectedHistorySnapshot = {4, 5, 6}; + testHistory(history, expectedHistorySnapshot); + } + + private void testHistory(EnumerationHistory history, int[] expectedHistorySnapshot) { + assertThat(history.shouldPauseSplitDiscovery(FEW_PENDING_SPLITS)).isFalse(); + if (history.hasFullHistory()) { + // throttle because pending split count is more than the sum of enumeration history + assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isTrue(); + } else { + // skipped throttling check because there is not enough history + assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isFalse(); + } + + int[] historySnapshot = history.snapshot(); + assertThat(historySnapshot).containsExactly(expectedHistorySnapshot); + + EnumerationHistory restoredHistory = new EnumerationHistory(MAX_HISTORY_SIZE); + restoredHistory.restore(historySnapshot); + + assertThat(history.shouldPauseSplitDiscovery(FEW_PENDING_SPLITS)).isFalse(); + if (history.hasFullHistory()) { + // throttle because pending split count is more than the sum of enumeration history + assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isTrue(); + } else { + // skipped throttling check because there is not enough history + assertThat(history.shouldPauseSplitDiscovery(30)).isFalse(); + } + } + + @Test + public void testRestoreDifferentSize() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + history.add(3); + int[] historySnapshot = history.snapshot(); + + EnumerationHistory smallerHistory = new EnumerationHistory(2); + smallerHistory.restore(historySnapshot); + int[] expectedRestoredHistorySnapshot = 
{2, 3}; + assertThat(smallerHistory.snapshot()).containsExactly(expectedRestoredHistorySnapshot); + + EnumerationHistory largerHisotry = new EnumerationHistory(4); + largerHisotry.restore(historySnapshot); + assertThat(largerHisotry.snapshot()).containsExactly(historySnapshot); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java new file mode 100644 index 000000000000..2520a6b763e4 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.flink.source.SplitHelpers; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergEnumeratorStateSerializer { + @TempDir protected Path temporaryFolder; + + private final IcebergEnumeratorStateSerializer serializer = + new IcebergEnumeratorStateSerializer(true); + + @Parameter(index = 0) + protected int version; + + @Parameters(name = "version={0}") + public static Object[][] parameters() { + return new Object[][] {new Object[] {1}, new Object[] {2}}; + } + + @TestTemplate + public void testEmptySnapshotIdAndPendingSplits() throws Exception { + IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(Collections.emptyList()); + testSerializer(enumeratorState); + } + + @TestTemplate + public void testSomeSnapshotIdAndEmptyPendingSplits() throws Exception { + IcebergEnumeratorPosition position = + IcebergEnumeratorPosition.of(1L, System.currentTimeMillis()); + + IcebergEnumeratorState enumeratorState = + new IcebergEnumeratorState(position, Collections.emptyList()); + testSerializer(enumeratorState); + } + + @TestTemplate + public void testSomeSnapshotIdAndPendingSplits() throws 
Exception { + IcebergEnumeratorPosition position = + IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 3, 1); + Collection pendingSplits = Lists.newArrayList(); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); + + IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(position, pendingSplits); + testSerializer(enumeratorState); + } + + @TestTemplate + public void testEnumerationSplitCountHistory() throws Exception { + if (version == 2) { + IcebergEnumeratorPosition position = + IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 3, 1); + Collection pendingSplits = Lists.newArrayList(); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); + int[] enumerationSplitCountHistory = {1, 2, 3}; + + IcebergEnumeratorState enumeratorState = + new IcebergEnumeratorState(position, pendingSplits, enumerationSplitCountHistory); + testSerializer(enumeratorState); + } + } + + private void testSerializer(IcebergEnumeratorState enumeratorState) throws IOException { + byte[] result; + if (version == 1) { + result = serializer.serializeV1(enumeratorState); + } else { + result = serializer.serialize(enumeratorState); + } + + IcebergEnumeratorState deserialized = serializer.deserialize(version, result); + assertEnumeratorStateEquals(enumeratorState, deserialized); + } + + private void assertEnumeratorStateEquals( + IcebergEnumeratorState expected, IcebergEnumeratorState actual) { + assertThat(actual.lastEnumeratedPosition()).isEqualTo(expected.lastEnumeratedPosition()); + + assertThat(actual.pendingSplits()).hasSameSizeAs(expected.pendingSplits()); + Iterator expectedIterator = expected.pendingSplits().iterator(); + Iterator actualIterator = actual.pendingSplits().iterator(); + for (int i = 0; i < expected.pendingSplits().size(); ++i) { + IcebergSourceSplitState expectedSplitState = expectedIterator.next(); + IcebergSourceSplitState actualSplitState = actualIterator.next(); + assertThat(actualSplitState.split().splitId()) + .isEqualTo(expectedSplitState.split().splitId()); + assertThat(actualSplitState.split().fileOffset()) + .isEqualTo(expectedSplitState.split().fileOffset()); + assertThat(actualSplitState.split().recordOffset()) + .isEqualTo(expectedSplitState.split().recordOffset()); + assertThat(actualSplitState.status()).isEqualTo(expectedSplitState.status()); + } + + assertThat(actual.enumerationSplitCountHistory()) + .containsExactly(expected.enumerationSplitCountHistory()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java new file mode 100644 index 000000000000..0d1d0ce3217c --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java @@ -0,0 +1,218 @@ +/* + * Licensed 
to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public abstract class ReaderFunctionTestBase { + + @Parameters(name = "fileFormat={0}") + public static Object[][] parameters() { + return new Object[][] { + new Object[] {FileFormat.AVRO}, + new Object[] {FileFormat.ORC}, + new Object[] {FileFormat.PARQUET} + }; + } + + @TempDir protected Path temporaryFolder; + + protected abstract ReaderFunction readerFunction(); + + protected abstract void assertRecords(List expected, List actual, Schema schema); + + @Parameter(index = 0) + private FileFormat fileFormat; + + private final GenericAppenderFactory appenderFactory = + new GenericAppenderFactory(TestFixtures.SCHEMA); + + private void assertRecordsAndPosition( + List expectedRecords, + int expectedFileOffset, + long startRecordOffset, + RecordsWithSplitIds> batch) { + batch.nextSplit(); + List actualRecords = Lists.newArrayList(); + long recordOffset = startRecordOffset; + RecordAndPosition recordAndPosition; + while ((recordAndPosition = batch.nextRecordFromSplit()) != null) { + actualRecords.add(recordAndPosition.record()); + assertThat(recordAndPosition.fileOffset()).isEqualTo(expectedFileOffset); + assertThat(recordAndPosition.recordOffset() - 1).isEqualTo(recordOffset); + recordOffset++; + } + + assertThat(actualRecords).hasSameSizeAs(expectedRecords); + assertRecords(expectedRecords, actualRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testNoCheckpointedPosition() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = 
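+ // No checkpointed position: the split starts at file offset 0 and record offset 0, so the
+ // reader is expected to return all three record batches in order, as asserted below.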
IcebergSourceSplit.fromCombinedScanTask(combinedScanTask); + CloseableIterator>> reader = + readerFunction().apply(split); + + RecordsWithSplitIds> batch0 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); + batch0.recycle(); + + RecordsWithSplitIds> batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } + + @TestTemplate + public void testCheckpointedPositionBeforeFirstFile() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 0L); + CloseableIterator>> reader = + readerFunction().apply(split); + + RecordsWithSplitIds> batch0 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); + batch0.recycle(); + + RecordsWithSplitIds> batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } + + @TestTemplate + public void testCheckpointedPositionMiddleFirstFile() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 1L); + CloseableIterator>> reader = + readerFunction().apply(split); + + RecordsWithSplitIds> batch0 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(0).subList(1, 2), 0, 1L, batch0); + batch0.recycle(); + + RecordsWithSplitIds> batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } + + @TestTemplate + public void testCheckpointedPositionAfterFirstFile() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 2L); + CloseableIterator>> reader = + readerFunction().apply(split); + + RecordsWithSplitIds> batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } + + @TestTemplate + public void testCheckpointedPositionBeforeSecondFile() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 0L); + CloseableIterator>> reader = + 
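+ // A checkpointed position of (fileOffset = 1, recordOffset = 0) means the first file was
+ // fully consumed before the checkpoint, so the reader resumes at the start of the second
+ // batch and never re-reads recordBatchList.get(0).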
readerFunction().apply(split); + + RecordsWithSplitIds> batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } + + @TestTemplate + public void testCheckpointedPositionMidSecondFile() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 1L); + CloseableIterator>> reader = + readerFunction().apply(split); + + RecordsWithSplitIds> batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1).subList(1, 2), 1, 1L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java new file mode 100644 index 000000000000..0edf8ae009fe --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.BaseCombinedScanTask; +import org.apache.iceberg.BaseFileScanTask; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.PartitionSpecParser; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.encryption.PlaintextEncryptionManager; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.ResidualEvaluator; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +public class ReaderUtil { + + private ReaderUtil() {} + + public static FileScanTask createFileTask( + List records, + File file, + FileFormat fileFormat, + FileAppenderFactory appenderFactory) + throws IOException { + FileAppender appender = + appenderFactory.newAppender(Files.localOutput(file), fileFormat); + try { + appender.addAll(records); + } finally { + appender.close(); + } + + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(file.length()) + .withPath(file.toString()) + .withFormat(fileFormat) + .withMetrics(appender.metrics()) + .build(); + + ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(Expressions.alwaysTrue()); + return new BaseFileScanTask( + dataFile, + null, + SchemaParser.toJson(TestFixtures.SCHEMA), + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), + residuals); + } + + public static DataIterator createDataIterator(CombinedScanTask combinedTask) { + return new DataIterator<>( + new RowDataFileScanTaskReader( + TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, Collections.emptyList()), + combinedTask, + new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), + PlaintextEncryptionManager.instance()); + } + + public static List> createRecordBatchList( + Schema schema, int listSize, int batchCount) { + return createRecordBatchList(0L, schema, listSize, batchCount); + } + + public static List> createRecordBatchList( + long seed, Schema schema, int listSize, int batchCount) { + List records = RandomGenericData.generate(schema, listSize * batchCount, seed); + return Lists.partition(records, batchCount); + } + + public static CombinedScanTask createCombinedScanTask( + List> recordBatchList, + Path temporaryFolder, + FileFormat fileFormat, + GenericAppenderFactory appenderFactory) + throws IOException { + List fileTasks = Lists.newArrayListWithCapacity(recordBatchList.size()); + for (List recordBatch : recordBatchList) { + FileScanTask fileTask = + ReaderUtil.createFileTask( + recordBatch, + File.createTempFile("junit", null, temporaryFolder.toFile()), + 
fileFormat, + appenderFactory); + fileTasks.add(fileTask); + } + + return new BaseCombinedScanTask(fileTasks); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java new file mode 100644 index 000000000000..6f09bd9a56d6 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.concurrent.atomic.AtomicBoolean; +import org.junit.jupiter.api.Test; + +public class TestArrayBatchRecords { + + @Test + public void testFullRange() { + String[] elements = new String[] {"0", "1", "2", "3"}; + testArray(elements, elements.length, 2, 119); + } + + @Test + public void testSubRange() { + String[] elements = new String[] {"0", "1", "2", "3"}; + testArray(elements, 2, 0, 0); + } + + private void testArray( + String[] elements, int numberOfRecords, int fileOffset, long startingRecordOffset) { + String splitId = "iceberg_split_1"; + AtomicBoolean recycled = new AtomicBoolean(); + + ArrayBatchRecords recordsWithSplitIds = + ArrayBatchRecords.forRecords( + splitId, + ignored -> recycled.set(true), + elements, + numberOfRecords, + fileOffset, + startingRecordOffset); + + assertThat(recordsWithSplitIds.nextSplit()).isEqualTo(splitId); + + for (int i = 0; i < numberOfRecords; i++) { + RecordAndPosition recAndPos = recordsWithSplitIds.nextRecordFromSplit(); + assertThat(recAndPos.record()).isEqualTo(elements[i]); + assertThat(recAndPos.fileOffset()).isEqualTo(fileOffset); + // recordOffset points to the position after this one + assertThat(recAndPos.recordOffset()).isEqualTo(startingRecordOffset + i + 1); + } + + assertThat(recordsWithSplitIds.nextRecordFromSplit()).isNull(); + assertThat(recordsWithSplitIds.nextSplit()).isNull(); + recordsWithSplitIds.recycle(); + assertThat(recycled.get()).isTrue(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java new file mode 100644 index 000000000000..1a78bb1b0010 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java @@ -0,0 +1,360 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.SourceReaderOptions; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.BaseCombinedScanTask; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.io.CloseableIterator; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestArrayPoolDataIteratorBatcherRowData { + + @TempDir protected Path temporaryFolder; + private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; + private final Configuration config = + new Configuration() + .set(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY, 1) + .set(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 2); + + private final GenericAppenderFactory appenderFactory = + new GenericAppenderFactory(TestFixtures.SCHEMA); + private final DataIteratorBatcher batcher = + new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory(TestFixtures.ROW_TYPE)); + + /** Read a CombinedScanTask that contains a single file with less than a full batch of records */ + @Test + public void testSingleFileLessThanOneFullBatch() throws Exception { + List records = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); + FileScanTask fileTask = + ReaderUtil.createFileTask( + records, + File.createTempFile("junit", null, temporaryFolder.toFile()), + FILE_FORMAT, + appenderFactory); + CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); + DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); + String splitId = "someSplitId"; + CloseableIterator>> recordBatchIterator = + batcher.batch(splitId, dataIterator); + + ArrayBatchRecords batch = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch.finishedSplits()).isEmpty(); + assertThat(batch.nextSplit()).isEqualTo(splitId); + assertThat(batch.records()).hasSize(2); + assertThat(batch.numberOfRecords()).isEqualTo(1); + + RecordAndPosition recordAndPosition = batch.nextRecordFromSplit(); + + /////////////////////////////// + // assert first record + + 
assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(1); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(0), recordAndPosition.record()); + + assertThat(batch.nextRecordFromSplit()).isNull(); + assertThat(batch.nextSplit()).isNull(); + batch.recycle(); + + assertThat(recordBatchIterator).isExhausted(); + } + + /** + * Read a CombinedScanTask that contains a single file with multiple batches. + * + *

    Insert 5 records in a single file that should result in 3 batches + */ + @Test + public void testSingleFileWithMultipleBatches() throws Exception { + List records = RandomGenericData.generate(TestFixtures.SCHEMA, 5, 1); + FileScanTask fileTask = + ReaderUtil.createFileTask( + records, + File.createTempFile("junit", null, temporaryFolder.toFile()), + FILE_FORMAT, + appenderFactory); + CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); + DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); + String splitId = "someSplitId"; + CloseableIterator>> recordBatchIterator = + batcher.batch(splitId, dataIterator); + + /////////////////////////////// + // assert first batch with full batch of 2 records + + ArrayBatchRecords batch0 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch0.finishedSplits()).isEmpty(); + assertThat(batch0.nextSplit()).isEqualTo(splitId); + assertThat(batch0.records()).hasSize(2); + assertThat(batch0.numberOfRecords()).isEqualTo(2); + + RecordAndPosition recordAndPosition; + + // assert first record + recordAndPosition = batch0.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(1); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(0), recordAndPosition.record()); + + // assert second record + recordAndPosition = batch0.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(2); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(1), recordAndPosition.record()); + + assertThat(batch0.nextRecordFromSplit()).isNull(); + assertThat(batch0.nextSplit()).isNull(); + batch0.recycle(); + + /////////////////////////////// + // assert second batch with full batch of 2 records + + ArrayBatchRecords batch1 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch1.records()).containsExactlyInAnyOrder(batch0.records()); + assertThat(batch1.finishedSplits()).isEmpty(); + assertThat(batch1.nextSplit()).isEqualTo(splitId); + assertThat(batch1.records()).hasSize(2); + assertThat(batch1.numberOfRecords()).isEqualTo(2); + + // assert third record + recordAndPosition = batch1.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(3); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(2), recordAndPosition.record()); + + // assert fourth record + recordAndPosition = batch1.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(4); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(3), recordAndPosition.record()); + + assertThat(batch1.nextRecordFromSplit()).isNull(); + assertThat(batch1.nextSplit()).isNull(); + batch1.recycle(); + + /////////////////////////////// + // assert third batch with partial batch of 1 record + + ArrayBatchRecords batch2 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch2.records()).containsExactlyInAnyOrder(batch0.records()); + 
assertThat(batch2.finishedSplits()).isEmpty(); + assertThat(batch2.nextSplit()).isEqualTo(splitId); + assertThat(batch2.records()).hasSize(2); + assertThat(batch2.numberOfRecords()).isEqualTo(1); + + // assert fifth record + recordAndPosition = batch2.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(5); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(4), recordAndPosition.record()); + + assertThat(batch2.nextRecordFromSplit()).isNull(); + assertThat(batch2.nextSplit()).isNull(); + batch2.recycle(); + + assertThat(recordBatchIterator).isExhausted(); + } + + /** + * Read a CombinedScanTask that contains with multiple files. + * + *

    In this test, we also seek the iterator to starting position (1, 1). + */ + @Test + public void testMultipleFilesWithSeekPosition() throws Exception { + List records0 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); + FileScanTask fileTask0 = + ReaderUtil.createFileTask( + records0, + File.createTempFile("junit", null, temporaryFolder.toFile()), + FILE_FORMAT, + appenderFactory); + List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 4, 2); + FileScanTask fileTask1 = + ReaderUtil.createFileTask( + records1, + File.createTempFile("junit", null, temporaryFolder.toFile()), + FILE_FORMAT, + appenderFactory); + List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 3, 3); + FileScanTask fileTask2 = + ReaderUtil.createFileTask( + records2, + File.createTempFile("junit", null, temporaryFolder.toFile()), + FILE_FORMAT, + appenderFactory); + CombinedScanTask combinedTask = + new BaseCombinedScanTask(Arrays.asList(fileTask0, fileTask1, fileTask2)); + + DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); + dataIterator.seek(1, 1); + + String splitId = "someSplitId"; + CloseableIterator>> recordBatchIterator = + batcher.batch(splitId, dataIterator); + + /////////////////////////////// + // file0 is skipped by seek + + /////////////////////////////// + // file1 has 4 records. because the seek position, first record is skipped. + // we should read 3 remaining records in 2 batches: + // batch10 with 2 records and batch11 with 1 records. + + // assert first batch from file1 with full batch of 2 records + + // variable naming convention: batch + ArrayBatchRecords batch10 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch10.finishedSplits()).isEmpty(); + assertThat(batch10.nextSplit()).isEqualTo(splitId); + assertThat(batch10.records()).hasSize(2); + assertThat(batch10.numberOfRecords()).isEqualTo(2); + + RecordAndPosition recordAndPosition; + + recordAndPosition = batch10.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(1); + assertThat(recordAndPosition.recordOffset()) + .as("seek should skip the first record in file1. 
starting from the second record") + .isEqualTo(2); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(1), recordAndPosition.record()); + + recordAndPosition = batch10.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(1); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(3); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(2), recordAndPosition.record()); + + assertThat(batch10.nextRecordFromSplit()).isNull(); + assertThat(batch10.nextSplit()).isNull(); + batch10.recycle(); + + // assert second batch from file1 with partial batch of 1 record + + // variable naming convention: batch__ + ArrayBatchRecords batch11 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch11.records()).containsExactlyInAnyOrder(batch10.records()); + assertThat(batch11.finishedSplits()).isEmpty(); + assertThat(batch11.nextSplit()).isEqualTo(splitId); + assertThat(batch11.records()).hasSize(2); + assertThat(batch11.numberOfRecords()).isEqualTo(1); + + recordAndPosition = batch11.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(1); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(4); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(3), recordAndPosition.record()); + + assertThat(batch11.nextRecordFromSplit()).isNull(); + assertThat(batch11.nextSplit()).isNull(); + batch11.recycle(); + + /////////////////////////////// + // file2 has 3 records. + // we should read 3 records in 2 batches: + // batch20 with 2 records and batch21 with 1 records + + // assert first batch from file2 with full batch of 2 records + + // variable naming convention: batch__ + ArrayBatchRecords batch20 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch20.records()).containsExactlyInAnyOrder(batch10.records()); + assertThat(batch20.finishedSplits()).isEmpty(); + assertThat(batch20.nextSplit()).isEqualTo(splitId); + assertThat(batch20.records()).hasSize(2); + assertThat(batch20.numberOfRecords()).isEqualTo(2); + + recordAndPosition = batch20.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(2); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(1); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(0), recordAndPosition.record()); + + recordAndPosition = batch20.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(2); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(2); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(1), recordAndPosition.record()); + + assertThat(batch20.nextRecordFromSplit()).isNull(); + assertThat(batch20.nextSplit()).isNull(); + batch20.recycle(); + + /////////////////////////////// + // assert second batch from file2 with partial batch of 1 record + + // variable naming convention: batch__ + ArrayBatchRecords batch21 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch21.records()).containsExactlyInAnyOrder(batch10.records()); + assertThat(batch21.finishedSplits()).isEmpty(); + assertThat(batch21.nextSplit()).isEqualTo(splitId); + assertThat(batch21.records()).hasSize(2); + 
assertThat(batch21.numberOfRecords()).isEqualTo(1); + + recordAndPosition = batch21.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(2); + + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(3); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(2), recordAndPosition.record()); + + assertThat(batch21.nextRecordFromSplit()).isNull(); + assertThat(batch21.nextSplit()).isNull(); + batch21.recycle(); + + assertThat(recordBatchIterator).isExhausted(); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java new file mode 100644 index 000000000000..af806d4c655d --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.time.LocalDateTime; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopTableExtension; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestColumnStatsWatermarkExtractor { + public static final Schema SCHEMA = + new Schema( + required(1, "timestamp_column", Types.TimestampType.withoutZone()), + required(2, "timestamptz_column", Types.TimestampType.withZone()), + required(3, "long_column", Types.LongType.get()), + required(4, "string_column", Types.StringType.get())); + + private static final GenericAppenderFactory APPENDER_FACTORY = new GenericAppenderFactory(SCHEMA); + + private static final List> TEST_RECORDS = + ImmutableList.of( + RandomGenericData.generate(SCHEMA, 3, 2L), RandomGenericData.generate(SCHEMA, 3, 19L)); + + private static final List> MIN_VALUES = + ImmutableList.of(Maps.newHashMapWithExpectedSize(3), Maps.newHashMapWithExpectedSize(3)); + + @TempDir protected Path temporaryFolder; + + @RegisterExtension + private static final HadoopTableExtension SOURCE_TABLE_EXTENSION = + new HadoopTableExtension(DATABASE, TestFixtures.TABLE, SCHEMA); + + @Parameter(index = 0) + private String columnName; + + @BeforeAll + public static void updateMinValue() { + for (int i = 0; i < TEST_RECORDS.size(); ++i) { + for (Record r : TEST_RECORDS.get(i)) { + Map minValues = MIN_VALUES.get(i); + + LocalDateTime localDateTime = (LocalDateTime) r.get(0); + minValues.merge( + "timestamp_column", localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli(), Math::min); + + OffsetDateTime offsetDateTime = (OffsetDateTime) r.get(1); + minValues.merge("timestamptz_column", offsetDateTime.toInstant().toEpochMilli(), Math::min); + + minValues.merge("long_column", (Long) r.get(2), Math::min); + } + } + } + + @Parameters(name = "columnName = {0}") + public static Collection data() { + return ImmutableList.of( + new Object[] {"timestamp_column"}, + new Object[] {"timestamptz_column"}, + new Object[] {"long_column"}); + } + + @TestTemplate + public void testSingle() throws IOException { + ColumnStatsWatermarkExtractor 
extractor = + new ColumnStatsWatermarkExtractor(SCHEMA, columnName, TimeUnit.MILLISECONDS); + + assertThat(extractor.extractWatermark(split(0))) + .isEqualTo(MIN_VALUES.get(0).get(columnName).longValue()); + } + + @TestTemplate + public void testTimeUnit() throws IOException { + assumeThat(columnName).isEqualTo("long_column"); + ColumnStatsWatermarkExtractor extractor = + new ColumnStatsWatermarkExtractor(SCHEMA, columnName, TimeUnit.MICROSECONDS); + + assertThat(extractor.extractWatermark(split(0))) + .isEqualTo(MIN_VALUES.get(0).get(columnName) / 1000L); + } + + @TestTemplate + public void testMultipleFiles() throws IOException { + assumeThat(columnName).isEqualTo("timestamp_column"); + IcebergSourceSplit combinedSplit = + IcebergSourceSplit.fromCombinedScanTask( + ReaderUtil.createCombinedScanTask( + TEST_RECORDS, temporaryFolder, FileFormat.PARQUET, APPENDER_FACTORY)); + + ColumnStatsWatermarkExtractor extractor = + new ColumnStatsWatermarkExtractor(SCHEMA, columnName, null); + + assertThat(extractor.extractWatermark(split(0))) + .isEqualTo(MIN_VALUES.get(0).get(columnName).longValue()); + assertThat(extractor.extractWatermark(split(1))) + .isEqualTo(MIN_VALUES.get(1).get(columnName).longValue()); + assertThat(extractor.extractWatermark(combinedSplit)) + .isEqualTo(Math.min(MIN_VALUES.get(0).get(columnName), MIN_VALUES.get(1).get(columnName))); + } + + @TestTemplate + public void testWrongColumn() { + assumeThat(columnName).isEqualTo("string_column"); + assertThatThrownBy(() -> new ColumnStatsWatermarkExtractor(SCHEMA, columnName, null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Found STRING, expected a LONG or TIMESTAMP column for watermark generation."); + } + + @TestTemplate + public void testEmptyStatistics() throws IOException { + assumeThat(columnName).isEqualTo("timestamp_column"); + + // Create an extractor for a column we do not have statistics + ColumnStatsWatermarkExtractor extractor = + new ColumnStatsWatermarkExtractor(10, "missing_field"); + assertThatThrownBy(() -> extractor.extractWatermark(split(0))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Missing statistics for column"); + } + + private IcebergSourceSplit split(int id) throws IOException { + return IcebergSourceSplit.fromCombinedScanTask( + ReaderUtil.createCombinedScanTask( + ImmutableList.of(TEST_RECORDS.get(id)), + temporaryFolder, + FileFormat.PARQUET, + APPENDER_FACTORY)); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java new file mode 100644 index 000000000000..8d6782586676 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.testutils.source.reader.TestingReaderContext; +import org.apache.flink.connector.testutils.source.reader.TestingReaderOutput; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.encryption.PlaintextEncryptionManager; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.SerializableComparator; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestIcebergSourceReader { + @TempDir protected Path temporaryFolder; + + private final GenericAppenderFactory appenderFactory = + new GenericAppenderFactory(TestFixtures.SCHEMA); + + @Test + public void testReaderMetrics() throws Exception { + TestingReaderOutput readerOutput = new TestingReaderOutput<>(); + TestingMetricGroup metricGroup = new TestingMetricGroup(); + TestingReaderContext readerContext = new TestingReaderContext(new Configuration(), metricGroup); + IcebergSourceReader reader = createReader(metricGroup, readerContext, null); + reader.start(); + + testOneSplitFetcher(reader, readerOutput, metricGroup, 1); + testOneSplitFetcher(reader, readerOutput, metricGroup, 2); + } + + @Test + public void testReaderOrder() throws Exception { + // Create 2 splits + List> recordBatchList1 = + ReaderUtil.createRecordBatchList(0L, TestFixtures.SCHEMA, 1, 1); + CombinedScanTask task1 = + ReaderUtil.createCombinedScanTask( + recordBatchList1, temporaryFolder, FileFormat.PARQUET, appenderFactory); + + List> recordBatchList2 = + ReaderUtil.createRecordBatchList(1L, TestFixtures.SCHEMA, 1, 1); + CombinedScanTask task2 = + ReaderUtil.createCombinedScanTask( + recordBatchList2, temporaryFolder, FileFormat.PARQUET, appenderFactory); + + // Sort the splits in one way + List rowDataList1 = + read( + Arrays.asList( + IcebergSourceSplit.fromCombinedScanTask(task1), + IcebergSourceSplit.fromCombinedScanTask(task2)), + 2); + + // Reverse the splits + List rowDataList2 = + read( + Arrays.asList( + IcebergSourceSplit.fromCombinedScanTask(task2), + IcebergSourceSplit.fromCombinedScanTask(task1)), + 2); + + // Check that the order of the elements is not changed + assertThat(rowDataList1).containsExactlyElementsOf(rowDataList2); + } + + private List read(List splits, long expected) throws Exception { + TestingMetricGroup metricGroup = new TestingMetricGroup(); + TestingReaderContext readerContext = 
new TestingReaderContext(new Configuration(), metricGroup); + // Using IdBasedComparator, so we can have a deterministic order of the splits + IcebergSourceReader reader = createReader(metricGroup, readerContext, new IdBasedComparator()); + reader.start(); + + reader.addSplits(splits); + TestingReaderOutput readerOutput = new TestingReaderOutput<>(); + while (readerOutput.getEmittedRecords().size() < expected) { + reader.pollNext(readerOutput); + } + + reader.pollNext(readerOutput); + + assertThat(readerOutput.getEmittedRecords()).hasSize((int) expected); + return readerOutput.getEmittedRecords(); + } + + private void testOneSplitFetcher( + IcebergSourceReader reader, + TestingReaderOutput readerOutput, + TestingMetricGroup metricGroup, + int expectedCount) + throws Exception { + long seed = expectedCount; + // Each split should contain only one file with one record + List> recordBatchList = + ReaderUtil.createRecordBatchList(seed, TestFixtures.SCHEMA, 1, 1); + CombinedScanTask task = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(task); + reader.addSplits(Collections.singletonList(split)); + + while (readerOutput.getEmittedRecords().size() < expectedCount) { + reader.pollNext(readerOutput); + } + + assertThat(readerOutput.getEmittedRecords()).hasSize(expectedCount); + TestHelpers.assertRowData( + TestFixtures.SCHEMA, + recordBatchList.get(0).get(0), + readerOutput.getEmittedRecords().get(expectedCount - 1)); + assertThat(metricGroup.counters().get("assignedSplits").getCount()).isEqualTo(expectedCount); + + // One more poll will get null record batch. + // That will finish the split and cause split fetcher to be closed due to idleness. + // Then next split will create a new split reader. + reader.pollNext(readerOutput); + } + + private IcebergSourceReader createReader( + MetricGroup metricGroup, + SourceReaderContext readerContext, + SerializableComparator splitComparator) { + IcebergSourceReaderMetrics readerMetrics = + new IcebergSourceReaderMetrics(metricGroup, "db.tbl"); + RowDataReaderFunction readerFunction = + new RowDataReaderFunction( + new Configuration(), + TestFixtures.SCHEMA, + TestFixtures.SCHEMA, + null, + true, + new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), + PlaintextEncryptionManager.instance(), + Collections.emptyList()); + return new IcebergSourceReader<>( + SerializableRecordEmitter.defaultEmitter(), + readerMetrics, + readerFunction, + splitComparator, + readerContext); + } + + private static class IdBasedComparator implements SerializableComparator { + @Override + public int compare(IcebergSourceSplit o1, IcebergSourceSplit o2) { + return o1.splitId().compareTo(o2.splitId()); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java new file mode 100644 index 000000000000..36749d3ec2dc --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.encryption.PlaintextEncryptionManager; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class TestLimitableDataIterator { + @TempDir private static Path temporaryFolder; + + private final RowDataFileScanTaskReader reader = + new RowDataFileScanTaskReader( + TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, Collections.emptyList()); + private final HadoopFileIO fileIO = new HadoopFileIO(new org.apache.hadoop.conf.Configuration()); + private final EncryptionManager encryptionManager = PlaintextEncryptionManager.instance(); + + private static CombinedScanTask combinedScanTask; + private static int totalRecords; + + @BeforeAll + public static void beforeClass() throws Exception { + GenericAppenderFactory appenderFactory = new GenericAppenderFactory(TestFixtures.SCHEMA); + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, appenderFactory); + totalRecords = 3 * 2; + } + + @ParameterizedTest + @ValueSource(longs = {-1L, 0L, 1L, 6L, 7L}) + public void testUnlimited(long limit) { + LimitableDataIterator dataIterator = + new LimitableDataIterator<>( + reader, combinedScanTask, fileIO, encryptionManager, RecordLimiter.create(limit)); + + List result = Lists.newArrayList(); + while (dataIterator.hasNext()) { + result.add(dataIterator.next()); + } + + if (limit <= 0 || limit > totalRecords) { + // read all records + assertThat(result).hasSize(totalRecords); + } else { + assertThat(result).hasSize((int) limit); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java new file mode 100644 index 000000000000..55f9c0af3a29 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or 
more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.conversion.DataStructureConverter; +import org.apache.flink.table.data.conversion.DataStructureConverters; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.Row; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.encryption.PlaintextEncryptionManager; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.hadoop.HadoopFileIO; + +public class TestRowDataReaderFunction extends ReaderFunctionTestBase { + + protected static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + private static final DataStructureConverter ROW_DATA_CONVERTER = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(ROW_TYPE)); + + @Override + protected ReaderFunction readerFunction() { + return new RowDataReaderFunction( + new Configuration(), + TestFixtures.SCHEMA, + TestFixtures.SCHEMA, + null, + true, + new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), + PlaintextEncryptionManager.instance(), + Collections.emptyList()); + } + + @Override + protected void assertRecords(List expected, List actual, Schema schema) { + List rows = toRows(actual); + TestHelpers.assertRecords(rows, expected, TestFixtures.SCHEMA); + } + + private List toRows(List actual) { + return actual.stream() + .map(rowData -> (Row) ROW_DATA_CONVERTER.toExternal(rowData)) + .collect(Collectors.toList()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java new file mode 100644 index 000000000000..290628c5fc90 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.Map; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.Gauge; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.metrics.SimpleCounter; +import org.apache.flink.metrics.groups.OperatorIOMetricGroup; +import org.apache.flink.metrics.groups.SourceReaderMetricGroup; +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +class TestingMetricGroup extends UnregisteredMetricsGroup implements SourceReaderMetricGroup { + private final Map counters; + + TestingMetricGroup() { + this.counters = Maps.newHashMap(); + } + + /** Pass along the reference to share the map for child metric groups. */ + private TestingMetricGroup(Map counters) { + this.counters = counters; + } + + Map counters() { + return counters; + } + + @Override + public Counter counter(String name) { + Counter counter = new SimpleCounter(); + counters.put(name, counter); + return counter; + } + + @Override + public MetricGroup addGroup(String name) { + return new TestingMetricGroup(counters); + } + + @Override + public MetricGroup addGroup(String key, String value) { + return new TestingMetricGroup(counters); + } + + @Override + public OperatorIOMetricGroup getIOMetricGroup() { + return new TestingOperatorIOMetricGroup(); + } + + @Override + public Counter getNumRecordsInErrorsCounter() { + return new SimpleCounter(); + } + + @Override + public void setPendingBytesGauge(Gauge pendingBytesGauge) {} + + @Override + public void setPendingRecordsGauge(Gauge pendingRecordsGauge) {} + + private static class TestingOperatorIOMetricGroup extends UnregisteredMetricsGroup + implements OperatorIOMetricGroup { + @Override + public Counter getNumRecordsInCounter() { + return new SimpleCounter(); + } + + @Override + public Counter getNumRecordsOutCounter() { + return new SimpleCounter(); + } + + @Override + public Counter getNumBytesInCounter() { + return new SimpleCounter(); + } + + @Override + public Counter getNumBytesOutCounter() { + return new SimpleCounter(); + } + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java new file mode 100644 index 000000000000..4a21f451e1e5 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.flink.source.SplitHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestIcebergSourceSplitSerializer { + + @TempDir protected Path temporaryFolder; + + private final IcebergSourceSplitSerializer serializer = new IcebergSourceSplitSerializer(true); + + @Test + public void testLatestVersion() throws Exception { + serializeAndDeserialize(1, 1); + serializeAndDeserialize(10, 2); + } + + private void serializeAndDeserialize(int splitCount, int filesPerSplit) throws Exception { + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + temporaryFolder, splitCount, filesPerSplit); + for (IcebergSourceSplit split : splits) { + byte[] result = serializer.serialize(split); + IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result); + assertSplitEquals(split, deserialized); + + byte[] cachedResult = serializer.serialize(split); + assertThat(cachedResult).isSameAs(result); + IcebergSourceSplit deserialized2 = + serializer.deserialize(serializer.getVersion(), cachedResult); + assertSplitEquals(split, deserialized2); + + split.updatePosition(0, 100); + byte[] resultAfterUpdatePosition = serializer.serialize(split); + // after position change, serialized bytes should have changed + assertThat(resultAfterUpdatePosition).isNotSameAs(cachedResult); + IcebergSourceSplit deserialized3 = + serializer.deserialize(serializer.getVersion(), resultAfterUpdatePosition); + assertSplitEquals(split, deserialized3); + } + } + + @Test + public void testV1() throws Exception { + serializeAndDeserializeV1(1, 1); + serializeAndDeserializeV1(10, 2); + } + + private void serializeAndDeserializeV1(int splitCount, int filesPerSplit) throws Exception { + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + temporaryFolder, splitCount, filesPerSplit); + for (IcebergSourceSplit split : splits) { + byte[] result = split.serializeV1(); + IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV1(result); + assertSplitEquals(split, deserialized); + } + } + + @Test + public void testV2() throws Exception { + serializeAndDeserializeV2(1, 1); + serializeAndDeserializeV2(10, 2); + } + + private void serializeAndDeserializeV2(int splitCount, int filesPerSplit) throws Exception { + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + temporaryFolder, splitCount, filesPerSplit); + for (IcebergSourceSplit split : splits) { + byte[] result = split.serializeV2(); + IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV2(result, true); + assertSplitEquals(split, deserialized); + } + } + + @Test + public void testV3WithTooManyDeleteFiles() throws Exception { + 
serializeAndDeserializeV3(1, 1, 5000); + } + + private void serializeAndDeserializeV3(int splitCount, int filesPerSplit, int mockDeletesPerSplit) + throws Exception { + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + temporaryFolder, splitCount, filesPerSplit); + final List splitsWithMockDeleteFiles = + SplitHelpers.equipSplitsWithMockDeleteFiles(splits, temporaryFolder, mockDeletesPerSplit); + + for (IcebergSourceSplit split : splitsWithMockDeleteFiles) { + byte[] result = split.serializeV3(); + IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV3(result, true); + assertSplitEquals(split, deserialized); + } + } + + @Test + public void testDeserializeV1() throws Exception { + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + for (IcebergSourceSplit split : splits) { + byte[] result = split.serializeV1(); + IcebergSourceSplit deserialized = serializer.deserialize(1, result); + assertSplitEquals(split, deserialized); + } + } + + @Test + public void testCheckpointedPosition() throws Exception { + final AtomicInteger index = new AtomicInteger(); + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 10, 2).stream() + .map( + split -> { + IcebergSourceSplit result; + if (index.get() % 2 == 0) { + result = IcebergSourceSplit.fromCombinedScanTask(split.task(), 1, 1); + } else { + result = split; + } + index.incrementAndGet(); + return result; + }) + .collect(Collectors.toList()); + + for (IcebergSourceSplit split : splits) { + byte[] result = serializer.serialize(split); + IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result); + assertSplitEquals(split, deserialized); + + byte[] cachedResult = serializer.serialize(split); + assertThat(cachedResult).isSameAs(result); + IcebergSourceSplit deserialized2 = + serializer.deserialize(serializer.getVersion(), cachedResult); + assertSplitEquals(split, deserialized2); + } + } + + private void assertSplitEquals(IcebergSourceSplit expected, IcebergSourceSplit actual) { + List expectedTasks = Lists.newArrayList(expected.task().tasks().iterator()); + List actualTasks = Lists.newArrayList(actual.task().tasks().iterator()); + assertThat(actualTasks).hasSameSizeAs(expectedTasks); + for (int i = 0; i < expectedTasks.size(); ++i) { + FileScanTask expectedTask = expectedTasks.get(i); + FileScanTask actualTask = actualTasks.get(i); + assertThat(actualTask.file().location()).isEqualTo(expectedTask.file().location()); + assertThat(actualTask.sizeBytes()).isEqualTo(expectedTask.sizeBytes()); + assertThat(actualTask.filesCount()).isEqualTo(expectedTask.filesCount()); + assertThat(actualTask.start()).isEqualTo(expectedTask.start()); + assertThat(actualTask.length()).isEqualTo(expectedTask.length()); + } + + assertThat(actual.fileOffset()).isEqualTo(expected.fileOffset()); + assertThat(actual.recordOffset()).isEqualTo(expected.recordOffset()); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java new file mode 100644 index 000000000000..e8054276fa44 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.util; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.mockito.Mockito; + +public class TestFlinkPackage { + + /** This unit test would need to be adjusted as new Flink version is supported. */ + @Test + public void testVersion() { + assertThat(FlinkPackage.version()).isEqualTo("2.0.0"); + } + + @Test + public void testDefaultVersion() { + // It's difficult to reproduce a reflection error in a unit test, so we just inject a mocked + // fault to test the default logic + + // First make sure we're not caching a version result from a previous test + FlinkPackage.setVersion(null); + try (MockedStatic mockedStatic = Mockito.mockStatic(FlinkPackage.class)) { + mockedStatic.when(FlinkPackage::versionFromJar).thenThrow(RuntimeException.class); + mockedStatic.when(FlinkPackage::version).thenCallRealMethod(); + assertThat(FlinkPackage.version()).isEqualTo(FlinkPackage.FLINK_UNKNOWN_VERSION); + } + FlinkPackage.setVersion(null); + try (MockedStatic mockedStatic = Mockito.mockStatic(FlinkPackage.class)) { + mockedStatic.when(FlinkPackage::versionFromJar).thenReturn(null); + mockedStatic.when(FlinkPackage::version).thenCallRealMethod(); + FlinkPackage.setVersion(null); + assertThat(FlinkPackage.version()).isEqualTo(FlinkPackage.FLINK_UNKNOWN_VERSION); + } + } +} diff --git a/flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory b/flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory new file mode 100644 index 000000000000..952255a52b7c --- /dev/null +++ b/flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.iceberg.flink.maintenance.operator.MetricsReporterFactoryForTests diff --git a/flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory new file mode 100644 index 000000000000..47a3c94aa991 --- /dev/null +++ b/flink/v2.0/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.iceberg.flink.source.BoundedTableFactory From ee62f8381808a366de87a05bea4a99b60fd72901 Mon Sep 17 00:00:00 2001 From: Maximilian Michels Date: Thu, 31 Jul 2025 16:14:20 +0200 Subject: [PATCH 3/5] Flink: Adjust build scripts for Flink 2.1 --- flink/build.gradle | 4 ++++ flink/v2.1/build.gradle | 36 ++++++++++++++++++------------------ gradle.properties | 4 ++-- gradle/libs.versions.toml | 12 ++++++++++++ settings.gradle | 9 +++++++++ 5 files changed, 45 insertions(+), 20 deletions(-) diff --git a/flink/build.gradle b/flink/build.gradle index e315f2558d95..86509c3b17d4 100644 --- a/flink/build.gradle +++ b/flink/build.gradle @@ -29,4 +29,8 @@ if (flinkVersions.contains("1.20")) { if (flinkVersions.contains("2.0")) { apply from: file("$projectDir/v2.0/build.gradle") +} + +if (flinkVersions.contains("2.1")) { + apply from: file("$projectDir/v2.1/build.gradle") } \ No newline at end of file diff --git a/flink/v2.1/build.gradle b/flink/v2.1/build.gradle index dfbaa8ff4184..f24eb22e79e8 100644 --- a/flink/v2.1/build.gradle +++ b/flink/v2.1/build.gradle @@ -17,7 +17,7 @@ * under the License. */ -String flinkMajorVersion = '2.0' +String flinkMajorVersion = '2.1' String scalaVersion = System.getProperty("scalaVersion") != null ? 
System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion") project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { @@ -32,15 +32,15 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-parquet') implementation project(':iceberg-hive-metastore') - compileOnly libs.flink20.avro + compileOnly libs.flink21.avro // for dropwizard histogram metrics implementation - compileOnly libs.flink20.metrics.dropwizard - compileOnly libs.flink20.streaming.java - compileOnly "${libs.flink20.streaming.java.get().module}:${libs.flink20.streaming.java.get().getVersion()}:tests" - compileOnly libs.flink20.table.api.java.bridge - compileOnly "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink20.get()}" - compileOnly libs.flink20.connector.base - compileOnly libs.flink20.connector.files + compileOnly libs.flink21.metrics.dropwizard + compileOnly libs.flink21.streaming.java + compileOnly "${libs.flink21.streaming.java.get().module}:${libs.flink21.streaming.java.get().getVersion()}:tests" + compileOnly libs.flink21.table.api.java.bridge + compileOnly "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink21.get()}" + compileOnly libs.flink21.connector.base + compileOnly libs.flink21.connector.files compileOnly libs.hadoop3.hdfs compileOnly libs.hadoop3.common @@ -71,13 +71,13 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { // for caching in DynamicSink implementation libs.caffeine - testImplementation libs.flink20.connector.test.utils - testImplementation libs.flink20.core - testImplementation libs.flink20.runtime - testImplementation(libs.flink20.test.utilsjunit) { + testImplementation libs.flink21.connector.test.utils + testImplementation libs.flink21.core + testImplementation libs.flink21.runtime + testImplementation(libs.flink21.test.utilsjunit) { exclude group: 'junit' } - testImplementation(libs.flink20.test.utils) { + testImplementation(libs.flink21.test.utils) { exclude group: "org.apache.curator", module: 'curator-test' exclude group: 'junit' } @@ -172,7 +172,7 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { } // for dropwizard histogram metrics implementation - implementation libs.flink20.metrics.dropwizard + implementation libs.flink21.metrics.dropwizard // for integration testing with the flink-runtime-jar // all of those dependencies are required because the integration test extends FlinkTestBase @@ -181,13 +181,13 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { integrationImplementation project(path: ":iceberg-flink:iceberg-flink-${flinkMajorVersion}", configuration: "testArtifacts") integrationImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') integrationImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') - integrationImplementation(libs.flink20.test.utils) { + integrationImplementation(libs.flink21.test.utils) { exclude group: "org.apache.curator", module: 'curator-test' exclude group: 'junit' } - integrationImplementation libs.flink20.table.api.java.bridge - integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink20.get()}" + integrationImplementation libs.flink21.table.api.java.bridge + integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink21.get()}" integrationImplementation libs.hadoop3.common integrationImplementation libs.hadoop3.hdfs diff --git 
a/gradle.properties b/gradle.properties index 5da56c59de41..80a00f8cedef 100644 --- a/gradle.properties +++ b/gradle.properties @@ -16,8 +16,8 @@ jmhOutputPath=build/reports/jmh/human-readable-output.txt jmhJsonOutputPath=build/reports/jmh/results.json jmhIncludeRegex=.* -systemProp.defaultFlinkVersions=2.0 -systemProp.knownFlinkVersions=1.19,1.20,2.0 +systemProp.defaultFlinkVersions=2.1 +systemProp.knownFlinkVersions=1.19,1.20,2.0,2.1 systemProp.defaultSparkVersions=4.0 systemProp.knownSparkVersions=3.4,3.5,4.0 systemProp.defaultKafkaVersions=3 diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index c18d33476f61..ea387e04404a 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -49,6 +49,7 @@ findbugs-jsr305 = "3.0.2" flink119 = { strictly = "1.19.2"} flink120 = { strictly = "1.20.1"} flink20 = { strictly = "2.0.0"} +flink21 = { strictly = "2.1.0"} google-libraries-bom = "26.67.0" guava = "33.4.8-jre" hadoop3 = "3.4.1" @@ -131,6 +132,12 @@ flink20-connector-files = { module = "org.apache.flink:flink-connector-files", v flink20-metrics-dropwizard = { module = "org.apache.flink:flink-metrics-dropwizard", version.ref = "flink20" } flink20-streaming-java = { module = "org.apache.flink:flink-streaming-java", version.ref = "flink20" } flink20-table-api-java-bridge = { module = "org.apache.flink:flink-table-api-java-bridge", version.ref = "flink20" } +flink21-avro = { module = "org.apache.flink:flink-avro", version.ref = "flink21" } +flink21-connector-base = { module = "org.apache.flink:flink-connector-base", version.ref = "flink21" } +flink21-connector-files = { module = "org.apache.flink:flink-connector-files", version.ref = "flink21" } +flink21-metrics-dropwizard = { module = "org.apache.flink:flink-metrics-dropwizard", version.ref = "flink21" } +flink21-streaming-java = { module = "org.apache.flink:flink-streaming-java", version.ref = "flink21" } +flink21-table-api-java-bridge = { module = "org.apache.flink:flink-table-api-java-bridge", version.ref = "flink21" } google-libraries-bom = { module = "com.google.cloud:libraries-bom", version.ref = "google-libraries-bom" } guava-guava = { module = "com.google.guava:guava", version.ref = "guava" } hadoop3-client = { module = "org.apache.hadoop:hadoop-client", version.ref = "hadoop3" } @@ -195,6 +202,11 @@ flink20-core = { module = "org.apache.flink:flink-core", version.ref = "flink20" flink20-runtime = { module = "org.apache.flink:flink-runtime", version.ref = "flink20" } flink20-test-utils = { module = "org.apache.flink:flink-test-utils", version.ref = "flink20" } flink20-test-utilsjunit = { module = "org.apache.flink:flink-test-utils-junit", version.ref = "flink20" } +flink21-connector-test-utils = { module = "org.apache.flink:flink-connector-test-utils", version.ref = "flink21" } +flink21-core = { module = "org.apache.flink:flink-core", version.ref = "flink21" } +flink21-runtime = { module = "org.apache.flink:flink-runtime", version.ref = "flink21" } +flink21-test-utils = { module = "org.apache.flink:flink-test-utils", version.ref = "flink21" } +flink21-test-utilsjunit = { module = "org.apache.flink:flink-test-utils-junit", version.ref = "flink21" } guava-testlib = { module = "com.google.guava:guava-testlib", version.ref = "guava" } jakarta-el-api = { module = "jakarta.el:jakarta.el-api", version.ref = "jakarta-el-api" } jakarta-servlet = {module = "jakarta.servlet:jakarta.servlet-api", version.ref = "jakarta-servlet-api"} diff --git a/settings.gradle b/settings.gradle index 57b8f7b58c70..bf84ccee03d1 100644 
--- a/settings.gradle +++ b/settings.gradle @@ -139,6 +139,15 @@ if (flinkVersions.contains("2.0")) { project(":iceberg-flink:flink-runtime-2.0").name = "iceberg-flink-runtime-2.0" } +if (flinkVersions.contains("2.1")) { + include ":iceberg-flink:flink-2.1" + include ":iceberg-flink:flink-runtime-2.1" + project(":iceberg-flink:flink-2.1").projectDir = file('flink/v2.1/flink') + project(":iceberg-flink:flink-2.1").name = "iceberg-flink-2.1" + project(":iceberg-flink:flink-runtime-2.1").projectDir = file('flink/v2.1/flink-runtime') + project(":iceberg-flink:flink-runtime-2.1").name = "iceberg-flink-runtime-2.1" +} + if (sparkVersions.contains("3.4")) { include ":iceberg-spark:spark-3.4_${scalaVersion}" include ":iceberg-spark:spark-extensions-3.4_${scalaVersion}" From 0c72ce08876a14c71f3781641106bf6f39c1744b Mon Sep 17 00:00:00 2001 From: Maximilian Michels Date: Thu, 31 Jul 2025 16:14:31 +0200 Subject: [PATCH 4/5] Flink: Code changes for Flink 2.1 --- .../flink/data/FlinkParquetReaders.java | 6 ++++ .../iceberg/flink/data/RowDataProjection.java | 6 ++++ .../iceberg/flink/data/StructRowData.java | 6 ++++ .../flink/sink/IcebergWriteAggregator.java | 28 +++++++++++++++++-- .../dynamic/DynamicWriteResultAggregator.java | 25 ++++++++++++++++- .../flink/sink/TestIcebergCommitter.java | 4 +-- .../iceberg/flink/util/TestFlinkPackage.java | 2 +- 7 files changed, 71 insertions(+), 6 deletions(-) diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java index 5c3581aef3ec..688481e220e3 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java @@ -34,6 +34,7 @@ import org.apache.flink.table.data.RowData; import org.apache.flink.table.data.StringData; import org.apache.flink.table.data.TimestampData; +import org.apache.flink.types.variant.Variant; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; import org.apache.iceberg.parquet.ParquetSchemaUtil; @@ -802,6 +803,11 @@ public RawValueData getRawValue(int pos) { return (RawValueData) values[pos]; } + @Override + public Variant getVariant(int pos) { + return (Variant) values[pos]; + } + @Override public byte[] getBinary(int ordinal) { return (byte[]) values[ordinal]; diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java index 9395b0e4810e..4144b04fe4eb 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java @@ -30,6 +30,7 @@ import org.apache.flink.table.data.TimestampData; import org.apache.flink.table.types.logical.RowType; import org.apache.flink.types.RowKind; +import org.apache.flink.types.variant.Variant; import org.apache.flink.util.StringUtils; import org.apache.iceberg.Schema; import org.apache.iceberg.flink.FlinkRowData; @@ -270,6 +271,11 @@ public RowData getRow(int pos, int numFields) { return (RowData) getValue(pos); } + @Override + public Variant getVariant(int pos) { + return (Variant) getValue(pos); + } + @Override public boolean equals(Object o) { if (this == o) { diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java index 34576a1e5c0b..b93e4346a47a 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java @@ -42,6 +42,7 @@ import org.apache.flink.table.data.StringData; import org.apache.flink.table.data.TimestampData; import org.apache.flink.types.RowKind; +import org.apache.flink.types.variant.Variant; import org.apache.iceberg.StructLike; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -240,6 +241,11 @@ public RowData getRow(int pos, int numFields) { return isNullAt(pos) ? null : getStructRowData(pos); } + @Override + public Variant getVariant(int pos) { + return isNullAt(pos) ? null : struct.get(pos, Variant.class); + } + private StructRowData getStructRowData(int pos) { return new StructRowData( type.fields().get(pos).type().asStructType(), struct.get(pos, StructLike.class)); diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java index 794ade577976..1dbb62363dce 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java @@ -21,6 +21,8 @@ import java.io.IOException; import java.util.Collection; import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.runtime.checkpoint.CheckpointIDCounter; +import org.apache.flink.runtime.state.StateInitializationContext; import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; @@ -43,18 +45,30 @@ class IcebergWriteAggregator extends AbstractStreamOperator> implements OneInputStreamOperator< CommittableMessage, CommittableMessage> { + private static final Logger LOG = LoggerFactory.getLogger(IcebergWriteAggregator.class); private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; + private final Collection results; + private final TableLoader tableLoader; + + private long lastCheckpointId = CheckpointIDCounter.INITIAL_CHECKPOINT_ID - 1; + private transient ManifestOutputFileFactory icebergManifestOutputFileFactory; private transient Table table; - private final TableLoader tableLoader; IcebergWriteAggregator(TableLoader tableLoader) { this.results = Sets.newHashSet(); this.tableLoader = tableLoader; } + @Override + public void initializeState(StateInitializationContext context) throws Exception { + context + .getRestoredCheckpointId() + .ifPresent(checkpointId -> this.lastCheckpointId = checkpointId); + } + @Override public void open() throws Exception { if (!tableLoader.isOpen()) { @@ -76,11 +90,21 @@ public void open() throws Exception { @Override public void finish() throws IOException { - prepareSnapshotPreBarrier(Long.MAX_VALUE); + prepareSnapshotPreBarrier(lastCheckpointId + 1); } @Override public void prepareSnapshotPreBarrier(long checkpointId) throws IOException { + if (checkpointId == lastCheckpointId) { + // Already flushed. This can happen when finish() above triggers flushing prior creating the + // final checkpoint. 
The calls are mutually exclusive, but we need to ensure we don't flush + // twice. + LOG.info("Aggregated writes for checkpoint id {} already flushed.", checkpointId); + return; + } + + this.lastCheckpointId = checkpointId; + IcebergCommittable committable = new IcebergCommittable( writeToManifest(results, checkpointId), diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java index 58ba183dfcd4..c833527ecc55 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java @@ -26,6 +26,8 @@ import java.util.Map; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.runtime.checkpoint.CheckpointIDCounter; +import org.apache.flink.runtime.state.StateInitializationContext; import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; @@ -56,11 +58,15 @@ class DynamicWriteResultAggregator extends AbstractStreamOperator> implements OneInputStreamOperator< CommittableMessage, CommittableMessage> { + private static final Logger LOG = LoggerFactory.getLogger(DynamicWriteResultAggregator.class); private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; private static final Duration CACHE_EXPIRATION_DURATION = Duration.ofMinutes(1); private final CatalogLoader catalogLoader; + + private long lastCheckpointId = CheckpointIDCounter.INITIAL_CHECKPOINT_ID - 1; + private transient Map> results; private transient Cache> specs; private transient Cache outputFileFactories; @@ -74,6 +80,13 @@ class DynamicWriteResultAggregator this.catalogLoader = catalogLoader; } + @Override + public void initializeState(StateInitializationContext context) throws Exception { + context + .getRestoredCheckpointId() + .ifPresent(checkpointId -> this.lastCheckpointId = checkpointId); + } + @Override public void open() throws Exception { this.flinkJobId = getContainingTask().getEnvironment().getJobID().toString(); @@ -90,11 +103,21 @@ public void open() throws Exception { @Override public void finish() throws IOException { - prepareSnapshotPreBarrier(Long.MAX_VALUE); + prepareSnapshotPreBarrier(lastCheckpointId + 1); } @Override public void prepareSnapshotPreBarrier(long checkpointId) throws IOException { + if (checkpointId == lastCheckpointId) { + // Already flushed. This can happen when finish() above triggers flushing prior creating the + // final checkpoint. The calls are mutually exclusive, but we need to ensure we don't flush + // twice. 
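    // Illustrative aside, not part of this change: the guard above (and the identical one in
    // IcebergWriteAggregator) implements a "flush at most once per checkpoint id" pattern.
    // lastCheckpointId is restored in initializeState() and advanced on every flush, so the
    // flush triggered by finish() for lastCheckpointId + 1 and a later barrier carrying the
    // same id cannot both emit committables. Reduced to a minimal, hypothetical sketch:
    class FlushOncePerCheckpoint {
      // CheckpointIDCounter.INITIAL_CHECKPOINT_ID - 1 plays this role in the operators above.
      private long lastCheckpointId = -1L;

      void restore(long restoredCheckpointId) {
        this.lastCheckpointId = restoredCheckpointId;
      }

      boolean flush(long checkpointId) {
        if (checkpointId == lastCheckpointId) {
          return false; // already flushed for this checkpoint id
        }

        this.lastCheckpointId = checkpointId;
        // ... emit the aggregated results for checkpointId here ...
        return true;
      }

      boolean endOfInput() {
        // Mirrors finish(): flush whatever is pending under the next checkpoint id.
        return flush(lastCheckpointId + 1);
      }
    }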
+ LOG.info("Aggregated writes for checkpoint id {} already flushed.", checkpointId); + return; + } + + this.lastCheckpointId = checkpointId; + Collection> committables = Sets.newHashSetWithExpectedSize(results.size()); int count = 0; diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java index 76338a185a62..abac7e16d9f0 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java @@ -1044,12 +1044,12 @@ public void testHandleEndInput() throws Exception { testHarness.open(); - long checkpointId = Long.MAX_VALUE; + long checkpointId = 1; processElement(jobId, checkpointId, testHarness, 1, OPERATOR_ID, dataFileTest1); testHarness.endInput(); - assertMaxCommittedCheckpointId(jobId, OPERATOR_ID, Long.MAX_VALUE); + assertMaxCommittedCheckpointId(jobId, OPERATOR_ID, checkpointId); List output = transformsToStreamElement(testHarness.getOutput()); assertThat(output).hasSize(2); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java index e8054276fa44..8f765ff21fa8 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java @@ -29,7 +29,7 @@ public class TestFlinkPackage { /** This unit test would need to be adjusted as new Flink version is supported. */ @Test public void testVersion() { - assertThat(FlinkPackage.version()).isEqualTo("2.0.0"); + assertThat(FlinkPackage.version()).isEqualTo("2.1.0"); } @Test From d9605b89f853038499d10e699621644d7cda528b Mon Sep 17 00:00:00 2001 From: Maximilian Michels Date: Thu, 31 Jul 2025 16:14:38 +0200 Subject: [PATCH 5/5] Flink: Remove support for Flink 1.19 --- .github/workflows/flink-ci.yml | 2 +- flink/build.gradle | 4 - flink/v1.19/build.gradle | 271 --- flink/v1.19/flink-runtime/LICENSE | 520 ------ flink/v1.19/flink-runtime/NOTICE | 360 ---- .../flink/TestIcebergConnectorSmoke.java | 21 - ...RecordSerializerDeserializerBenchmark.java | 138 -- .../shuffle/MapRangePartitionerBenchmark.java | 121 -- .../SketchRangePartitionerBenchmark.java | 114 -- .../apache/iceberg/flink/CatalogLoader.java | 215 --- .../apache/iceberg/flink/FlinkCatalog.java | 869 ---------- .../iceberg/flink/FlinkCatalogFactory.java | 214 --- .../apache/iceberg/flink/FlinkConfParser.java | 297 ---- .../iceberg/flink/FlinkConfigOptions.java | 113 -- .../flink/FlinkCreateTableOptions.java | 116 -- .../flink/FlinkDynamicTableFactory.java | 236 --- .../flink/FlinkEnvironmentContext.java | 31 - .../apache/iceberg/flink/FlinkFilters.java | 266 --- .../apache/iceberg/flink/FlinkFixupTypes.java | 50 - .../apache/iceberg/flink/FlinkReadConf.java | 213 --- .../iceberg/flink/FlinkReadOptions.java | 123 -- .../apache/iceberg/flink/FlinkRowData.java | 47 - .../apache/iceberg/flink/FlinkSchemaUtil.java | 380 ----- .../iceberg/flink/FlinkSourceFilter.java | 49 - .../apache/iceberg/flink/FlinkTypeToType.java | 203 --- .../iceberg/flink/FlinkTypeVisitor.java | 80 - .../apache/iceberg/flink/FlinkWriteConf.java | 239 --- .../iceberg/flink/FlinkWriteOptions.java | 90 - .../iceberg/flink/IcebergTableSink.java | 192 --- .../apache/iceberg/flink/RowDataWrapper.java | 142 -- 
.../org/apache/iceberg/flink/TableLoader.java | 159 -- .../apache/iceberg/flink/TypeToFlinkType.java | 146 -- .../apache/iceberg/flink/actions/Actions.java | 52 - .../flink/actions/RewriteDataFilesAction.java | 76 - .../data/AvroWithFlinkSchemaVisitor.java | 75 - .../iceberg/flink/data/FlinkAvroWriter.java | 168 -- .../iceberg/flink/data/FlinkOrcReader.java | 131 -- .../iceberg/flink/data/FlinkOrcReaders.java | 283 ---- .../iceberg/flink/data/FlinkOrcWriter.java | 163 -- .../iceberg/flink/data/FlinkOrcWriters.java | 318 ---- .../flink/data/FlinkParquetReaders.java | 860 ---------- .../flink/data/FlinkParquetWriters.java | 608 ------- .../flink/data/FlinkPlannedAvroReader.java | 196 --- .../flink/data/FlinkSchemaVisitor.java | 161 -- .../iceberg/flink/data/FlinkValueReaders.java | 356 ---- .../iceberg/flink/data/FlinkValueWriters.java | 269 --- .../data/ParquetWithFlinkSchemaVisitor.java | 229 --- .../iceberg/flink/data/RowDataProjection.java | 342 ---- .../iceberg/flink/data/RowDataUtil.java | 109 -- .../iceberg/flink/data/StructRowData.java | 300 ---- .../maintenance/api/DeleteOrphanFiles.java | 324 ---- .../maintenance/api/ExpireSnapshots.java | 144 -- .../api/FlinkMaintenanceConfig.java | 128 -- .../maintenance/api/JdbcLockFactory.java | 325 ---- .../flink/maintenance/api/LockConfig.java | 218 --- .../api/MaintenanceTaskBuilder.java | 229 --- .../maintenance/api/RewriteDataFiles.java | 294 ---- .../api/RewriteDataFilesConfig.java | 184 --- .../maintenance/api/TableMaintenance.java | 336 ---- .../flink/maintenance/api/TaskResult.java | 64 - .../flink/maintenance/api/Trigger.java | 65 - .../maintenance/api/TriggerLockFactory.java | 64 - .../flink/maintenance/api/ZkLockFactory.java | 225 --- .../operator/DataFileRewriteCommitter.java | 199 --- .../operator/DataFileRewritePlanner.java | 217 --- .../operator/DataFileRewriteRunner.java | 253 --- .../operator/DeleteFilesProcessor.java | 121 -- .../operator/ExpireSnapshotsProcessor.java | 136 -- .../maintenance/operator/FileNameReader.java | 49 - .../operator/FileUriKeySelector.java | 60 - .../operator/ListFileSystemFiles.java | 133 -- .../operator/ListMetadataFiles.java | 93 -- .../operator/LockFactoryBuilder.java | 87 - .../maintenance/operator/LockRemover.java | 146 -- .../flink/maintenance/operator/LogUtil.java | 26 - .../operator/MetadataTablePlanner.java | 133 -- .../maintenance/operator/MonitorSource.java | 206 --- .../operator/OrphanFilesDetector.java | 191 --- .../SingleThreadedIteratorSource.java | 197 --- .../maintenance/operator/SkipOnError.java | 90 - .../maintenance/operator/TableChange.java | 244 --- .../operator/TableMaintenanceMetrics.java | 75 - .../maintenance/operator/TableReader.java | 120 -- .../operator/TaskResultAggregator.java | 104 -- .../operator/TriggerEvaluator.java | 151 -- .../maintenance/operator/TriggerManager.java | 327 ---- .../AvroGenericRecordToRowDataMapper.java | 61 - .../flink/sink/BaseDeltaTaskWriter.java | 126 -- .../sink/BucketPartitionKeySelector.java | 70 - .../iceberg/flink/sink/BucketPartitioner.java | 103 -- .../flink/sink/BucketPartitionerUtil.java | 125 -- .../flink/sink/CachingTableSupplier.java | 91 -- .../iceberg/flink/sink/CommitSummary.java | 103 -- .../CommittableToTableChangeConverter.java | 107 -- .../iceberg/flink/sink/DeltaManifests.java | 73 - .../flink/sink/DeltaManifestsSerializer.java | 124 -- .../flink/sink/EqualityFieldKeySelector.java | 88 - .../flink/sink/FlinkAppenderFactory.java | 274 ---- .../flink/sink/FlinkFileWriterFactory.java | 293 ---- 
.../iceberg/flink/sink/FlinkManifestUtil.java | 168 -- .../apache/iceberg/flink/sink/FlinkSink.java | 773 --------- .../iceberg/flink/sink/FlinkWriteResult.java | 40 - .../flink/sink/IcebergCommittable.java | 95 -- .../sink/IcebergCommittableSerializer.java | 68 - .../iceberg/flink/sink/IcebergCommitter.java | 317 ---- .../flink/sink/IcebergFilesCommitter.java | 483 ------ .../sink/IcebergFilesCommitterMetrics.java | 75 - .../iceberg/flink/sink/IcebergSink.java | 963 ----------- .../iceberg/flink/sink/IcebergSinkWriter.java | 113 -- .../flink/sink/IcebergStreamWriter.java | 121 -- .../sink/IcebergStreamWriterMetrics.java | 100 -- .../flink/sink/IcebergWriteAggregator.java | 127 -- .../flink/sink/ManifestOutputFileFactory.java | 98 -- .../flink/sink/PartitionKeySelector.java | 66 - .../flink/sink/PartitionedDeltaWriter.java | 97 -- .../flink/sink/RowDataTaskWriterFactory.java | 268 --- .../apache/iceberg/flink/sink/SinkUtil.java | 150 -- .../iceberg/flink/sink/TaskWriterFactory.java | 45 - .../flink/sink/UnpartitionedDeltaWriter.java | 69 - .../flink/sink/WriteResultSerializer.java | 63 - .../sink/dynamic/CompareSchemasVisitor.java | 275 ---- .../flink/sink/dynamic/DataConverter.java | 235 --- .../sink/dynamic/DynamicCommittable.java | 104 -- .../dynamic/DynamicCommittableSerializer.java | 71 - .../flink/sink/dynamic/DynamicCommitter.java | 424 ----- .../sink/dynamic/DynamicCommitterMetrics.java | 50 - .../sink/dynamic/DynamicIcebergSink.java | 427 ----- .../flink/sink/dynamic/DynamicRecord.java | 130 -- .../sink/dynamic/DynamicRecordGenerator.java | 34 - .../sink/dynamic/DynamicRecordInternal.java | 166 -- .../DynamicRecordInternalSerializer.java | 295 ---- .../dynamic/DynamicRecordInternalType.java | 103 -- .../sink/dynamic/DynamicRecordProcessor.java | 184 --- .../flink/sink/dynamic/DynamicSinkUtil.java | 65 - .../dynamic/DynamicTableUpdateOperator.java | 84 - .../sink/dynamic/DynamicWriteResult.java | 40 - .../dynamic/DynamicWriteResultAggregator.java | 188 --- .../dynamic/DynamicWriteResultSerializer.java | 62 - .../flink/sink/dynamic/DynamicWriter.java | 213 --- .../sink/dynamic/DynamicWriterMetrics.java | 49 - .../sink/dynamic/EvolveSchemaVisitor.java | 204 --- .../flink/sink/dynamic/HashKeyGenerator.java | 382 ----- .../iceberg/flink/sink/dynamic/LRUCache.java | 64 - .../sink/dynamic/PartitionSpecEvolution.java | 137 -- .../sink/dynamic/TableMetadataCache.java | 272 ---- .../sink/dynamic/TableSerializerCache.java | 133 -- .../flink/sink/dynamic/TableUpdater.java | 214 --- .../flink/sink/dynamic/WriteTarget.java | 144 -- .../shuffle/AggregatedStatisticsTracker.java | 264 --- .../sink/shuffle/CompletedStatistics.java | 128 -- .../CompletedStatisticsSerializer.java | 188 --- .../flink/sink/shuffle/DataStatistics.java | 48 - .../shuffle/DataStatisticsCoordinator.java | 536 ------ .../DataStatisticsCoordinatorProvider.java | 70 - .../sink/shuffle/DataStatisticsOperator.java | 266 --- .../DataStatisticsOperatorFactory.java | 98 -- .../shuffle/DataStatisticsSerializer.java | 204 --- .../flink/sink/shuffle/GlobalStatistics.java | 114 -- .../shuffle/GlobalStatisticsSerializer.java | 199 --- .../flink/sink/shuffle/KeyAssignment.java | 155 -- .../flink/sink/shuffle/MapAssignment.java | 242 --- .../flink/sink/shuffle/MapDataStatistics.java | 88 - .../sink/shuffle/MapRangePartitioner.java | 95 -- .../flink/sink/shuffle/RangePartitioner.java | 110 -- .../shuffle/RequestGlobalStatisticsEvent.java | 40 - .../sink/shuffle/SketchDataStatistics.java | 87 - .../sink/shuffle/SketchRangePartitioner.java 
| 51 - .../flink/sink/shuffle/SketchUtil.java | 161 -- .../flink/sink/shuffle/SortKeySerializer.java | 411 ----- .../sink/shuffle/SortKeySketchSerializer.java | 143 -- .../flink/sink/shuffle/SortKeyUtil.java | 59 - .../flink/sink/shuffle/StatisticsEvent.java | 76 - .../sink/shuffle/StatisticsOrRecord.java | 112 -- .../shuffle/StatisticsOrRecordSerializer.java | 208 --- .../StatisticsOrRecordTypeInformation.java | 115 -- .../flink/sink/shuffle/StatisticsType.java | 55 - .../flink/sink/shuffle/StatisticsUtil.java | 143 -- .../AvroGenericRecordFileScanTaskReader.java | 42 - .../iceberg/flink/source/DataIterator.java | 160 -- .../iceberg/flink/source/DataTaskReader.java | 47 - .../flink/source/FileScanTaskReader.java | 35 - .../flink/source/FlinkInputFormat.java | 141 -- .../iceberg/flink/source/FlinkInputSplit.java | 48 - .../iceberg/flink/source/FlinkSource.java | 318 ---- .../flink/source/FlinkSplitPlanner.java | 189 --- .../iceberg/flink/source/IcebergSource.java | 702 -------- .../flink/source/IcebergTableSource.java | 238 --- .../source/RowDataFileScanTaskReader.java | 243 --- .../iceberg/flink/source/RowDataRewriter.java | 166 -- .../RowDataToAvroGenericRecordConverter.java | 70 - .../iceberg/flink/source/ScanContext.java | 597 ------- .../iceberg/flink/source/SourceUtil.java | 77 - .../source/StreamingMonitorFunction.java | 269 --- .../flink/source/StreamingReaderOperator.java | 246 --- .../source/StreamingStartingStrategy.java | 61 - .../source/assigner/DefaultSplitAssigner.java | 119 -- .../flink/source/assigner/GetSplitResult.java | 77 - .../assigner/OrderedSplitAssignerFactory.java | 46 - .../assigner/SimpleSplitAssignerFactory.java | 37 - .../flink/source/assigner/SplitAssigner.java | 124 -- .../source/assigner/SplitAssignerFactory.java | 30 - .../source/assigner/SplitAssignerType.java | 33 - .../enumerator/AbstractIcebergEnumerator.java | 188 --- .../ContinuousEnumerationResult.java | 57 - .../ContinuousIcebergEnumerator.java | 188 --- .../enumerator/ContinuousSplitPlanner.java | 30 - .../ContinuousSplitPlannerImpl.java | 248 --- .../source/enumerator/EnumerationHistory.java | 100 -- .../enumerator/IcebergEnumeratorPosition.java | 79 - .../IcebergEnumeratorPositionSerializer.java | 90 - .../enumerator/IcebergEnumeratorState.java | 65 - .../IcebergEnumeratorStateSerializer.java | 194 --- .../enumerator/StaticIcebergEnumerator.java | 51 - .../source/reader/ArrayBatchRecords.java | 171 -- .../reader/ArrayPoolDataIteratorBatcher.java | 130 -- .../reader/AvroGenericRecordConverter.java | 69 - .../AvroGenericRecordReaderFunction.java | 110 -- .../reader/ColumnStatsWatermarkExtractor.java | 98 -- .../reader/ConverterReaderFunction.java | 117 -- .../source/reader/DataIteratorBatcher.java | 36 - .../reader/DataIteratorReaderFunction.java | 43 - .../source/reader/IcebergSourceReader.java | 77 - .../reader/IcebergSourceReaderMetrics.java | 61 - .../reader/IcebergSourceSplitReader.java | 167 -- .../source/reader/LimitableDataIterator.java | 56 - .../flink/source/reader/ListBatchRecords.java | 85 - .../reader/ListDataIteratorBatcher.java | 94 -- .../source/reader/MetaDataReaderFunction.java | 65 - .../flink/source/reader/ReaderFunction.java | 31 - .../source/reader/RecordAndPosition.java | 79 - .../flink/source/reader/RecordFactory.java | 34 - .../flink/source/reader/RecordLimiter.java | 45 - .../flink/source/reader/RowConverter.java | 64 - .../flink/source/reader/RowDataConverter.java | 32 - .../source/reader/RowDataReaderFunction.java | 115 -- .../source/reader/RowDataRecordFactory.java | 
74 - .../reader/SerializableRecordEmitter.java | 40 - .../reader/SplitWatermarkExtractor.java | 28 - .../WatermarkExtractorRecordEmitter.java | 67 - .../source/split/IcebergSourceSplit.java | 220 --- .../split/IcebergSourceSplitSerializer.java | 64 - .../source/split/IcebergSourceSplitState.java | 37 - .../split/IcebergSourceSplitStatus.java | 25 - .../source/split/SerializableComparator.java | 24 - .../flink/source/split/SerializerHelper.java | 206 --- .../flink/source/split/SplitComparators.java | 78 - .../flink/source/split/SplitRequestEvent.java | 54 - .../iceberg/flink/util/ElapsedTimeGauge.java | 47 - .../flink/util/FlinkAlterTableUtil.java | 248 --- .../flink/util/FlinkCompatibilityUtil.java | 56 - .../iceberg/flink/util/FlinkPackage.java | 61 - .../org.apache.flink.table.factories.Factory | 16 - ....apache.flink.table.factories.TableFactory | 16 - .../flink/AvroGenericRecordConverterBase.java | 90 - .../apache/iceberg/flink/CatalogTestBase.java | 121 -- .../apache/iceberg/flink/DataGenerator.java | 42 - .../apache/iceberg/flink/DataGenerators.java | 1172 ------------- .../iceberg/flink/HadoopCatalogExtension.java | 105 -- .../iceberg/flink/HadoopTableExtension.java | 59 - .../flink/MiniFlinkClusterExtension.java | 67 - .../iceberg/flink/RowDataConverter.java | 145 -- .../apache/iceberg/flink/SimpleDataUtil.java | 469 ------ .../org/apache/iceberg/flink/SqlBase.java | 110 -- .../org/apache/iceberg/flink/TestBase.java | 145 -- .../iceberg/flink/TestCatalogLoader.java | 116 -- .../iceberg/flink/TestCatalogTableLoader.java | 113 -- .../iceberg/flink/TestChangeLogTable.java | 296 ---- .../flink/TestDataFileSerialization.java | 203 --- .../apache/iceberg/flink/TestFixtures.java | 61 - .../flink/TestFlinkAnonymousTable.java | 65 - .../flink/TestFlinkCatalogDatabase.java | 253 --- .../flink/TestFlinkCatalogFactory.java | 119 -- .../iceberg/flink/TestFlinkCatalogTable.java | 715 -------- .../TestFlinkCatalogTablePartitions.java | 119 -- .../iceberg/flink/TestFlinkConfParser.java | 61 - .../iceberg/flink/TestFlinkFilters.java | 463 ------ .../iceberg/flink/TestFlinkHiveCatalog.java | 101 -- .../iceberg/flink/TestFlinkSchemaUtil.java | 474 ------ .../iceberg/flink/TestFlinkTableSink.java | 266 --- .../flink/TestFlinkTableSinkCompaction.java | 184 --- .../flink/TestFlinkTableSinkExtended.java | 388 ----- .../apache/iceberg/flink/TestFlinkUpsert.java | 336 ---- .../org/apache/iceberg/flink/TestHelpers.java | 669 -------- .../iceberg/flink/TestIcebergConnector.java | 331 ---- .../flink/TestManifestFileSerialization.java | 173 -- .../iceberg/flink/TestRowDataWrapper.java | 94 -- .../apache/iceberg/flink/TestTableLoader.java | 57 - .../iceberg/flink/TestTableSerialization.java | 110 -- .../actions/TestRewriteDataFilesAction.java | 523 ------ .../iceberg/flink/data/RandomRowData.java | 38 - .../flink/data/RowDataToRowMapper.java | 50 - .../flink/data/TestFlinkAvroReaderWriter.java | 136 -- .../flink/data/TestFlinkOrcReaderWriter.java | 118 -- .../flink/data/TestFlinkParquetReader.java | 268 --- .../flink/data/TestFlinkParquetWriter.java | 121 -- .../flink/data/TestRowDataProjection.java | 593 ------- .../iceberg/flink/data/TestRowProjection.java | 583 ------- .../iceberg/flink/data/TestStructRowData.java | 100 -- .../api/MaintenanceTaskInfraExtension.java | 78 - .../api/MaintenanceTaskTestBase.java | 120 -- .../api/TestDeleteOrphanFiles.java | 340 ---- .../maintenance/api/TestExpireSnapshots.java | 233 --- .../maintenance/api/TestJdbcLockFactory.java | 41 - 
.../maintenance/api/TestLockFactoryBase.java | 93 -- .../maintenance/api/TestMaintenanceE2E.java | 80 - .../maintenance/api/TestRewriteDataFiles.java | 457 ------ .../api/TestRewriteDataFilesConfig.java | 142 -- .../maintenance/api/TestTableMaintenance.java | 462 ------ .../maintenance/api/TestZkLockFactory.java | 54 - .../maintenance/operator/CollectingSink.java | 115 -- .../maintenance/operator/ManualSource.java | 320 ---- .../MetricsReporterFactoryForTests.java | 192 --- .../operator/OperatorTestBase.java | 400 ----- .../maintenance/operator/RewriteUtil.java | 85 - .../TestDataFileRewriteCommitter.java | 278 ---- .../operator/TestDataFileRewritePlanner.java | 210 --- .../operator/TestDataFileRewriteRunner.java | 357 ---- .../operator/TestDeleteFilesProcessor.java | 115 -- .../TestExpireSnapshotsProcessor.java | 122 -- .../operator/TestListFileSystemFiles.java | 110 -- .../operator/TestListMetadataFiles.java | 90 - .../maintenance/operator/TestLockConfig.java | 84 - .../operator/TestLockFactoryBuilder.java | 109 -- .../maintenance/operator/TestLockRemover.java | 443 ----- .../operator/TestMonitorSource.java | 347 ---- .../operator/TestOrphanFilesDetector.java | 246 --- .../maintenance/operator/TestSkipOnError.java | 100 -- .../operator/TestTablePlanerAndReader.java | 120 -- .../operator/TestTaskResultAggregator.java | 78 - .../operator/TestTriggerManager.java | 671 -------- .../iceberg/flink/sink/SinkTestUtil.java | 62 - .../TestAvroGenericRecordToRowDataMapper.java | 38 - .../sink/TestBucketPartitionKeySelector.java | 67 - .../flink/sink/TestBucketPartitioner.java | 108 -- ...TestBucketPartitionerFlinkIcebergSink.java | 233 --- .../flink/sink/TestBucketPartitionerUtil.java | 126 -- .../flink/sink/TestCachingTableSupplier.java | 81 - ...TestCommittableToTableChangeConverter.java | 319 ---- .../flink/sink/TestCompressionSettings.java | 258 --- .../flink/sink/TestDeltaTaskWriter.java | 428 ----- .../flink/sink/TestFlinkAppenderFactory.java | 65 - .../sink/TestFlinkFileWriterFactory.java | 66 - .../flink/sink/TestFlinkIcebergSink.java | 143 -- .../flink/sink/TestFlinkIcebergSinkBase.java | 133 -- .../sink/TestFlinkIcebergSinkBranch.java | 158 -- .../TestFlinkIcebergSinkDistributionMode.java | 602 ------- .../sink/TestFlinkIcebergSinkExtended.java | 250 --- ...IcebergSinkRangeDistributionBucketing.java | 255 --- .../flink/sink/TestFlinkIcebergSinkV2.java | 254 --- .../sink/TestFlinkIcebergSinkV2Base.java | 424 ----- .../sink/TestFlinkIcebergSinkV2Branch.java | 137 -- ...estFlinkIcebergSinkV2DistributionMode.java | 618 ------- .../iceberg/flink/sink/TestFlinkManifest.java | 313 ---- .../sink/TestFlinkPartitioningWriters.java | 77 - .../sink/TestFlinkPositionDeltaWriters.java | 66 - .../sink/TestFlinkRollingFileWriters.java | 51 - .../flink/sink/TestFlinkWriterMetrics.java | 60 - .../flink/sink/TestIcebergCommitter.java | 1446 ----------------- .../flink/sink/TestIcebergFilesCommitter.java | 1238 -------------- .../iceberg/flink/sink/TestIcebergSink.java | 563 ------- .../flink/sink/TestIcebergSinkBranch.java | 142 -- .../flink/sink/TestIcebergSinkCompact.java | 149 -- .../iceberg/flink/sink/TestIcebergSinkV2.java | 287 ---- .../flink/sink/TestIcebergSinkV2Branch.java | 110 -- .../flink/sink/TestIcebergStreamWriter.java | 409 ----- .../flink/sink/TestRowDataPartitionKey.java | 251 --- .../iceberg/flink/sink/TestTaskWriters.java | 241 --- ...namicRecordInternalSerializerTestBase.java | 96 -- .../dynamic/TestCompareSchemasVisitor.java | 229 --- .../TestDynamicCommittableSerializer.java | 62 - 
.../sink/dynamic/TestDynamicCommitter.java | 381 ----- .../sink/dynamic/TestDynamicIcebergSink.java | 850 ---------- .../dynamic/TestDynamicIcebergSinkPerf.java | 245 --- ...icRecordInternalSerializerWriteSchema.java | 28 - ...RecordInternalSerializerWriteSchemaId.java | 28 - .../TestDynamicTableUpdateOperator.java | 121 -- .../TestDynamicWriteResultAggregator.java | 82 - .../TestDynamicWriteResultSerializer.java | 82 - .../flink/sink/dynamic/TestDynamicWriter.java | 254 --- .../sink/dynamic/TestEvolveSchemaVisitor.java | 626 ------- .../sink/dynamic/TestHashKeyGenerator.java | 350 ---- .../flink/sink/dynamic/TestLRUCache.java | 88 - .../dynamic/TestPartitionSpecEvolution.java | 188 --- .../sink/dynamic/TestRowDataConverter.java | 262 --- .../sink/dynamic/TestTableMetadataCache.java | 94 -- .../dynamic/TestTableSerializerCache.java | 124 -- .../flink/sink/dynamic/TestTableUpdater.java | 160 -- .../sink/shuffle/DataDistributionUtil.java | 178 -- .../iceberg/flink/sink/shuffle/Fixtures.java | 100 -- .../TestAggregatedStatisticsTracker.java | 465 ------ .../TestCompletedStatisticsSerializer.java | 103 -- .../shuffle/TestDataDistributionUtil.java | 49 - .../TestDataStatisticsCoordinator.java | 373 ----- ...TestDataStatisticsCoordinatorProvider.java | 187 --- .../shuffle/TestDataStatisticsOperator.java | 382 ----- .../shuffle/TestDataStatisticsSerializer.java | 53 - .../TestGlobalStatisticsSerializer.java | 59 - .../sink/shuffle/TestMapDataStatistics.java | 67 - .../sink/shuffle/TestMapRangePartitioner.java | 436 ----- .../sink/shuffle/TestRangePartitioner.java | 65 - .../shuffle/TestRangePartitionerSkew.java | 183 --- .../shuffle/TestSketchDataStatistics.java | 60 - .../shuffle/TestSketchRangePartitioner.java | 88 - .../flink/sink/shuffle/TestSketchUtil.java | 200 --- .../shuffle/TestSortKeySerializerBase.java | 65 - .../TestSortKeySerializerNestedStruct.java | 55 - .../TestSortKeySerializerPrimitives.java | 90 - .../TestSortKeySerializerSnapshot.java | 235 --- .../flink/sink/shuffle/TestSortKeyUtil.java | 73 - ...TestStatisticsOrRecordTypeInformation.java | 46 - .../flink/source/BoundedTableFactory.java | 184 --- .../flink/source/BoundedTestSource.java | 108 -- .../flink/source/ChangeLogTableTestBase.java | 95 -- .../iceberg/flink/source/SplitHelpers.java | 200 --- .../iceberg/flink/source/SqlHelpers.java | 60 - .../flink/source/TableSourceTestBase.java | 105 -- .../flink/source/TestBoundedTableFactory.java | 81 - .../flink/source/TestFlinkInputFormat.java | 211 --- .../TestFlinkInputFormatReaderDeletes.java | 71 - .../flink/source/TestFlinkMergingMetrics.java | 67 - .../flink/source/TestFlinkMetaDataTable.java | 813 --------- .../source/TestFlinkReaderDeletesBase.java | 90 - .../iceberg/flink/source/TestFlinkScan.java | 540 ------ .../flink/source/TestFlinkScanSql.java | 69 - .../iceberg/flink/source/TestFlinkSource.java | 90 - .../flink/source/TestFlinkSourceConfig.java | 61 - .../flink/source/TestFlinkSourceSql.java | 87 - .../flink/source/TestFlinkTableSource.java | 561 ------- .../source/TestIcebergSourceBounded.java | 151 -- ...TestIcebergSourceBoundedConverterBase.java | 222 --- ...TestIcebergSourceBoundedGenericRecord.java | 96 -- .../source/TestIcebergSourceBoundedRow.java | 58 - .../source/TestIcebergSourceBoundedSql.java | 76 - .../source/TestIcebergSourceContinuous.java | 573 ------- .../source/TestIcebergSourceFailover.java | 392 ----- ...gSourceFailoverWithWatermarkExtractor.java | 130 -- .../TestIcebergSourceInferParallelism.java | 181 --- 
.../TestIcebergSourceReaderDeletes.java | 102 -- .../flink/source/TestIcebergSourceSql.java | 233 --- ...stIcebergSourceWithWatermarkExtractor.java | 408 ----- ...estIcebergSpeculativeExecutionSupport.java | 206 --- .../TestMetadataTableReadableMetrics.java | 364 ----- .../flink/source/TestProjectMetaColumn.java | 189 --- ...stRowDataToAvroGenericRecordConverter.java | 36 - .../iceberg/flink/source/TestScanContext.java | 112 -- .../iceberg/flink/source/TestSourceUtil.java | 61 - .../iceberg/flink/source/TestSqlBase.java | 175 -- .../flink/source/TestStreamScanSql.java | 490 ------ .../source/TestStreamingMonitorFunction.java | 399 ----- .../source/TestStreamingReaderOperator.java | 290 ---- .../assigner/SplitAssignerTestBase.java | 132 -- .../assigner/TestDefaultSplitAssigner.java | 43 - ...tFileSequenceNumberBasedSplitAssigner.java | 81 - .../TestWatermarkBasedSplitAssigner.java | 146 -- .../ManualContinuousSplitPlanner.java | 97 -- .../TestContinuousIcebergEnumerator.java | 352 ---- .../TestContinuousSplitPlannerImpl.java | 734 --------- ...ntinuousSplitPlannerImplStartStrategy.java | 219 --- .../enumerator/TestEnumerationHistory.java | 135 -- .../TestIcebergEnumeratorStateSerializer.java | 146 -- .../source/reader/ReaderFunctionTestBase.java | 218 --- .../flink/source/reader/ReaderUtil.java | 128 -- .../source/reader/TestArrayBatchRecords.java | 69 - ...stArrayPoolDataIteratorBatcherRowData.java | 360 ---- .../TestColumnStatsWatermarkExtractor.java | 176 -- .../reader/TestIcebergSourceReader.java | 181 --- .../reader/TestLimitableDataIterator.java | 84 - .../reader/TestRowDataReaderFunction.java | 69 - .../source/reader/TestingMetricGroup.java | 102 -- .../TestIcebergSourceSplitSerializer.java | 183 --- .../iceberg/flink/util/TestFlinkPackage.java | 55 - ...ink.metrics.reporter.MetricReporterFactory | 16 - .../org.apache.flink.table.factories.Factory | 16 - gradle.properties | 2 +- gradle/libs.versions.toml | 12 - settings.gradle | 9 - 476 files changed, 2 insertions(+), 88276 deletions(-) delete mode 100644 flink/v1.19/build.gradle delete mode 100644 flink/v1.19/flink-runtime/LICENSE delete mode 100644 flink/v1.19/flink-runtime/NOTICE delete mode 100644 flink/v1.19/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java delete mode 100644 flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java delete mode 100644 flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java delete mode 100644 flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java delete mode 100644 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java delete mode 
100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java delete mode 100644 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java delete mode 100644 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java delete mode 100644 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java 
delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java delete mode 100644 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java delete mode 
100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java delete mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java delete mode 100644 flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory delete mode 100644 flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java delete mode 100644 
flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java delete mode 100644 
flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java delete mode 100644 
flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java delete mode 100644 
flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java delete mode 100644 
flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java delete mode 100644 
flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java delete mode 100644 
flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java
 delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java
 delete mode 100644 flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory
 delete mode 100644 flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory

diff --git a/.github/workflows/flink-ci.yml b/.github/workflows/flink-ci.yml
index 18cf01d06ce7..d3f57a70bc5e 100644
--- a/.github/workflows/flink-ci.yml
+++ b/.github/workflows/flink-ci.yml
@@ -74,7 +74,7 @@ jobs:
     strategy:
       matrix:
         jvm: [11, 17, 21]
-        flink: ['1.19', '1.20', '2.0']
+        flink: ['1.20', '2.0', '2.1']
     env:
       SPARK_LOCAL_IP: localhost
     steps:
diff --git a/flink/build.gradle b/flink/build.gradle
index 86509c3b17d4..caf2a443ace4 100644
--- a/flink/build.gradle
+++ b/flink/build.gradle
@@ -19,10 +19,6 @@
 def flinkVersions = (System.getProperty("flinkVersions") != null ? System.getProperty("flinkVersions") : System.getProperty("defaultFlinkVersions")).split(",")
 
-if (flinkVersions.contains("1.19")) {
-  apply from: file("$projectDir/v1.19/build.gradle")
-}
-
 if (flinkVersions.contains("1.20")) {
   apply from: file("$projectDir/v1.20/build.gradle")
 }
 
diff --git a/flink/v1.19/build.gradle b/flink/v1.19/build.gradle
deleted file mode 100644
index 05fdddd63ccf..000000000000
--- a/flink/v1.19/build.gradle
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-String flinkMajorVersion = '1.19'
-String scalaVersion = System.getProperty("scalaVersion") != null ? System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion")
-
-project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") {
-
-  dependencies {
-    implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow')
-    api project(':iceberg-api')
-    implementation project(':iceberg-common')
-    implementation project(':iceberg-core')
-    api project(':iceberg-data')
-    implementation project(':iceberg-orc')
-    implementation project(':iceberg-parquet')
-    implementation project(':iceberg-hive-metastore')
-
-    compileOnly libs.flink119.avro
-    // for dropwizard histogram metrics implementation
-    compileOnly libs.flink119.metrics.dropwizard
-    compileOnly libs.flink119.streaming.java
-    compileOnly "${libs.flink119.streaming.java.get().module}:${libs.flink119.streaming.java.get().getVersion()}:tests"
-    compileOnly libs.flink119.table.api.java.bridge
-    compileOnly "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink119.get()}"
-    compileOnly libs.flink119.connector.base
-    compileOnly libs.flink119.connector.files
-
-    compileOnly libs.hadoop3.hdfs
-    compileOnly libs.hadoop3.common
-    compileOnly(libs.hadoop3.minicluster) {
-      exclude group: 'org.apache.avro', module: 'avro'
-    }
-
-    implementation(libs.parquet.avro) {
-      exclude group: 'org.apache.avro', module: 'avro'
-      // already shaded by Parquet
-      exclude group: 'it.unimi.dsi'
-      exclude group: 'org.codehaus.jackson'
-    }
-
-    compileOnly libs.avro.avro
-
-    implementation("${libs.orc.core.get().module}:${libs.versions.orc.get()}:nohive") {
-      exclude group: 'org.apache.hadoop'
-      exclude group: 'commons-lang'
-      // These artifacts are shaded and included in the orc-core fat jar
-      exclude group: 'com.google.protobuf', module: 'protobuf-java'
-      exclude group: 'org.apache.hive', module: 'hive-storage-api'
-      exclude group: 'org.slf4j'
-    }
-
-    implementation libs.datasketches
-
-    // for caching in DynamicSink
-    implementation libs.caffeine
-
-    testImplementation libs.flink119.connector.test.utils
-    testImplementation libs.flink119.core
-    testImplementation libs.flink119.runtime
-    testImplementation(libs.flink119.test.utilsjunit) {
-      exclude group:
'junit' - } - testImplementation(libs.flink119.test.utils) { - exclude group: "org.apache.curator", module: 'curator-test' - exclude group: 'junit' - } - - testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') - testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') - testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') - testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') - - // By default, hive-exec is a fat/uber jar and it exports a guava library - // that's really old. We use the core classifier to be able to override our guava - // version. Luckily, hive-exec seems to work okay so far with this version of guava - // See: https://github.com/apache/hive/blob/master/ql/pom.xml#L911 for more context. - testImplementation("${libs.hive2.exec.get().module}:${libs.hive2.exec.get().getVersion()}:core") { - exclude group: 'org.apache.avro', module: 'avro' - exclude group: 'org.slf4j', module: 'slf4j-log4j12' - exclude group: 'org.pentaho' // missing dependency - exclude group: 'org.apache.hive', module: 'hive-llap-tez' - exclude group: 'org.apache.logging.log4j' - exclude group: 'com.google.protobuf', module: 'protobuf-java' - exclude group: 'org.apache.calcite' - exclude group: 'org.apache.calcite.avatica' - exclude group: 'com.google.code.findbugs', module: 'jsr305' - } - - testImplementation(libs.hive2.metastore) { - exclude group: 'org.apache.avro', module: 'avro' - exclude group: 'org.slf4j', module: 'slf4j-log4j12' - exclude group: 'org.pentaho' // missing dependency - exclude group: 'org.apache.hbase' - exclude group: 'org.apache.logging.log4j' - exclude group: 'co.cask.tephra' - exclude group: 'com.google.code.findbugs', module: 'jsr305' - exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all' - exclude group: 'org.eclipse.jetty.orbit', module: 'javax.servlet' - exclude group: 'org.apache.parquet', module: 'parquet-hadoop-bundle' - exclude group: 'com.tdunning', module: 'json' - exclude group: 'javax.transaction', module: 'transaction-api' - exclude group: 'com.zaxxer', module: 'HikariCP' - exclude group: 'org.slf4j' - } - - testImplementation libs.awaitility - testImplementation libs.assertj.core - testImplementation libs.sqlite.jdbc - } - - test { - useJUnitPlatform() - } -} - -project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { - apply plugin: 'com.gradleup.shadow' - - tasks.jar.dependsOn tasks.shadowJar - - sourceSets { - integration { - java.srcDir "$projectDir/src/integration/java" - resources.srcDir "$projectDir/src/integration/resources" - } - } - - configurations { - implementation { - // included in Flink - exclude group: 'org.slf4j' - exclude group: 'org.apache.commons' - exclude group: 'commons-pool' - exclude group: 'commons-codec' - exclude group: 'org.xerial.snappy' - exclude group: 'javax.xml.bind' - exclude group: 'javax.annotation' - } - } - - dependencies { - implementation(project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}")) { - exclude group: 'org.apache.flink' - } - implementation project(':iceberg-aws') - implementation project(':iceberg-azure') - implementation(project(':iceberg-aliyun')) { - exclude group: 'edu.umd.cs.findbugs', module: 'findbugs' - exclude group: 'org.apache.httpcomponents', module: 'httpclient' - exclude group: 'commons-logging', module: 'commons-logging' - } - implementation project(':iceberg-gcp') - implementation(project(':iceberg-nessie')) { - exclude group: 
'com.google.code.findbugs', module: 'jsr305' - } - - // for dropwizard histogram metrics implementation - implementation libs.flink119.metrics.dropwizard - - // for integration testing with the flink-runtime-jar - // all of those dependencies are required because the integration test extends FlinkTestBase - integrationCompileOnly project(':iceberg-api') - integrationImplementation libs.assertj.core - integrationImplementation project(path: ":iceberg-flink:iceberg-flink-${flinkMajorVersion}", configuration: "testArtifacts") - integrationImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') - integrationImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') - integrationImplementation(libs.flink119.test.utils) { - exclude group: "org.apache.curator", module: 'curator-test' - exclude group: 'junit' - } - - integrationImplementation libs.flink119.table.api.java.bridge - integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink119.get()}" - - integrationImplementation libs.hadoop3.common - integrationImplementation libs.hadoop3.hdfs - integrationImplementation(libs.hadoop3.minicluster) { - exclude group: 'org.apache.avro', module: 'avro' - } - - integrationImplementation(libs.hive2.metastore) { - exclude group: 'org.apache.avro', module: 'avro' - exclude group: 'org.slf4j', module: 'slf4j-log4j12' - exclude group: 'org.pentaho' // missing dependency - exclude group: 'org.apache.hbase' - exclude group: 'org.apache.logging.log4j' - exclude group: 'co.cask.tephra' - exclude group: 'com.google.code.findbugs', module: 'jsr305' - exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all' - exclude group: 'org.eclipse.jetty.orbit', module: 'javax.servlet' - exclude group: 'org.apache.parquet', module: 'parquet-hadoop-bundle' - exclude group: 'com.tdunning', module: 'json' - exclude group: 'javax.transaction', module: 'transaction-api' - exclude group: 'com.zaxxer', module: 'HikariCP' - exclude group: 'org.slf4j' - } - - integrationImplementation("${libs.hive2.exec.get().module}:${libs.hive2.exec.get().getVersion()}:core") { - exclude group: 'org.apache.avro', module: 'avro' - exclude group: 'org.slf4j', module: 'slf4j-log4j12' - exclude group: 'org.pentaho' // missing dependency - exclude group: 'org.apache.hive', module: 'hive-llap-tez' - exclude group: 'org.apache.logging.log4j' - exclude group: 'com.google.protobuf', module: 'protobuf-java' - exclude group: 'org.apache.calcite' - exclude group: 'org.apache.calcite.avatica' - exclude group: 'com.google.code.findbugs', module: 'jsr305' - } - } - - shadowJar { - configurations = [project.configurations.runtimeClasspath] - - zip64 true - - // include the LICENSE and NOTICE files for the shaded Jar - from(projectDir) { - include 'LICENSE' - include 'NOTICE' - } - - // Relocate dependencies to avoid conflicts - relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro' - relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet' - relocate 'com.google.errorprone', 'org.apache.iceberg.shaded.com.google.errorprone' - relocate 'com.google.flatbuffers', 'org.apache.iceberg.shaded.com.google.flatbuffers' - relocate 'com.fasterxml', 'org.apache.iceberg.shaded.com.fasterxml' - relocate 'com.github.benmanes', 'org.apache.iceberg.shaded.com.github.benmanes' - relocate 'org.checkerframework', 'org.apache.iceberg.shaded.org.checkerframework' - relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded' - 
relocate 'org.apache.orc', 'org.apache.iceberg.shaded.org.apache.orc' - relocate 'io.airlift', 'org.apache.iceberg.shaded.io.airlift' - relocate 'org.threeten.extra', 'org.apache.iceberg.shaded.org.threeten.extra' - relocate 'org.apache.hc.client5', 'org.apache.iceberg.shaded.org.apache.hc.client5' - relocate 'org.apache.hc.core5', 'org.apache.iceberg.shaded.org.apache.hc.core5' - - archiveClassifier.set(null) - } - - task integrationTest(type: Test) { - description = "Test Flink Runtime Jar against Flink ${flinkMajorVersion}" - group = "verification" - jvmArgs += project.property('extraJvmArgs') - testClassesDirs = sourceSets.integration.output.classesDirs - classpath = sourceSets.integration.runtimeClasspath + files(shadowJar.archiveFile.get().asFile.path) - inputs.file(shadowJar.archiveFile.get().asFile.path) - } - integrationTest.dependsOn shadowJar - check.dependsOn integrationTest - - jar { - enabled = false - } -} diff --git a/flink/v1.19/flink-runtime/LICENSE b/flink/v1.19/flink-runtime/LICENSE deleted file mode 100644 index 9ca869edb59b..000000000000 --- a/flink/v1.19/flink-runtime/LICENSE +++ /dev/null @@ -1,520 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Avro. - -Copyright: 2014-2020 The Apache Software Foundation. -Home page: https://parquet.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains the Jackson JSON processor. - -Copyright: 2007-2020 Tatu Saloranta and other contributors -Home page: http://jackson.codehaus.org/ -License: http://www.apache.org/licenses/LICENSE-2.0.txt - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Parquet. - -Copyright: 2014-2020 The Apache Software Foundation. -Home page: https://parquet.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Thrift. 
- -Copyright: 2006-2010 The Apache Software Foundation. -Home page: https://thrift.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains fastutil. - -Copyright: 2002-2014 Sebastiano Vigna -Home page: http://fastutil.di.unimi.it/ -License: http://www.apache.org/licenses/LICENSE-2.0.html - --------------------------------------------------------------------------------- - -This binary artifact contains Apache ORC. - -Copyright: 2013-2020 The Apache Software Foundation. -Home page: https://orc.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Hive's storage API via ORC. - -Copyright: 2013-2020 The Apache Software Foundation. -Home page: https://hive.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Airlift Aircompressor. - -Copyright: 2011-2020 Aircompressor authors. -Home page: https://github.com/airlift/aircompressor -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains JetBrains annotations. - -Copyright: 2000-2020 JetBrains s.r.o. -Home page: https://github.com/JetBrains/java-annotations -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google Guava. - -Copyright: 2006-2020 The Guava Authors -Home page: https://github.com/google/guava -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google Error Prone Annotations. - -Copyright: Copyright 2011-2019 The Error Prone Authors -Home page: https://github.com/google/error-prone -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains checkerframework checker-qual Annotations. - -Copyright: 2004-2020 the Checker Framework developers -Home page: https://github.com/typetools/checker-framework -License: https://github.com/typetools/checker-framework/blob/master/LICENSE.txt (MIT license) - -License text: -| The annotations are licensed under the MIT License. (The text of this -| license appears below.) More specifically, all the parts of the Checker -| Framework that you might want to include with your own program use the -| MIT License. This is the checker-qual.jar file and all the files that -| appear in it: every file in a qual/ directory, plus utility files such -| as NullnessUtil.java, RegexUtil.java, SignednessUtil.java, etc. -| In addition, the cleanroom implementations of third-party annotations, -| which the Checker Framework recognizes as aliases for its own -| annotations, are licensed under the MIT License. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This binary artifact contains Caffeine by Ben Manes. - -Copyright: 2014-2020 Ben Manes and contributors -Home page: https://github.com/ben-manes/caffeine -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google protobuf. - -Copyright: 2008 Google Inc. -Home page: https://developers.google.com/protocol-buffers -License: https://github.com/protocolbuffers/protobuf/blob/master/LICENSE (BSD) - -License text: - -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. 
- --------------------------------------------------------------------------------- - -This binary artifact contains ThreeTen. - -Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. -Home page: https://www.threeten.org/threeten-extra/ -License: https://github.com/ThreeTen/threeten-extra/blob/master/LICENSE.txt (BSD 3-clause) - -License text: - -| All rights reserved. -| -| * Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This binary artifact contains Project Nessie. - -Copyright: Copyright 2015-2025 Dremio Corporation -Home page: https://projectnessie.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary includes code from Apache Commons. - -* Core ArrayUtil. - -Copyright: 2020 The Apache Software Foundation -Home page: https://commons.apache.org/ -License: https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache HttpComponents Client. - -Copyright: 1999-2022 The Apache Software Foundation. -Home page: https://hc.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. - -* retry and error handling logic in ExponentialHttpRequestRetryStrategy.java - -Copyright: 1999-2022 The Apache Software Foundation. -Home page: https://hc.apache.org/ -License: https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains failsafe. - -Copyright: Jonathan Halterman and friends -Home page: https://failsafe.dev/ -License: https://www.apache.org/licenses/LICENSE-2.0.html - --------------------------------------------------------------------------------- - -This binary artifact contains Codehale Metrics. 
- -Copyright: (c) 2010-2013 Coda Hale, Yammer.com, 2014-2021 Dropwizard Team -Home page: https://github.com/dropwizard/metrics -License: https://www.apache.org/licenses/LICENSE-2.0.html - --------------------------------------------------------------------------------- - -This binary artifact contains RoaringBitmap. - -Copyright: (c) 2013-... the RoaringBitmap authors -Home page: https://github.com/RoaringBitmap/RoaringBitmap -License: https://www.apache.org/licenses/LICENSE-2.0.html - --------------------------------------------------------------------------------- - -This binary artifact contains Eclipse Microprofile OpenAPI. - -Copyright: Copyright (c) 2017 Contributors to the Eclipse Foundation -Home page: https://github.com/microprofile/microprofile-open-api -License: https://www.apache.org/licenses/LICENSE-2.0.html - --------------------------------------------------------------------------------- - -This binary artifact contains Luben Zstd. - -Copyright: Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. -Home page: https://github.com/luben/zstd-jni/ -License: BSD-2 License -License text: - -| Zstd-jni: JNI bindings to Zstd Library -| -| Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. -| -| BSD License -| -| Redistribution and use in source and binary forms, with or without modification, -| are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, this -| list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, this -| list of conditions and the following disclaimer in the documentation and/or -| other materials provided with the distribution. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -| ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -| (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -| LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -| ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/flink/v1.19/flink-runtime/NOTICE b/flink/v1.19/flink-runtime/NOTICE deleted file mode 100644 index 0838a76b3473..000000000000 --- a/flink/v1.19/flink-runtime/NOTICE +++ /dev/null @@ -1,360 +0,0 @@ - -Apache Iceberg -Copyright 2017-2025 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - --------------------------------------------------------------------------------- - -This binary artifact includes Airlift Aircompressor with the following in its -NOTICE file: - -| Snappy Copyright Notices -| ========================= -| -| * Copyright 2011 Dain Sundstrom -| * Copyright 2011, Google Inc. -| -| -| Snappy License -| =============== -| Copyright 2011, Google Inc. -| All rights reserved. 
-| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This binary artifact includes Project Nessie with the following in its NOTICE -file: - -| Nessie -| Copyright 2015-2025 Dremio Corporation -| -| --------------------------------------- -| This project includes code from Apache Polaris (incubating), with the following in its NOTICE file: -| -| | Apache Polaris (incubating) -| | Copyright 2024 The Apache Software Foundation -| | -| | This product includes software developed at -| | The Apache Software Foundation (http://www.apache.org/). -| | -| | The initial code for the Polaris project was donated -| | to the ASF by Snowflake Inc. (https://www.snowflake.com/) copyright 2024. -| -| --------------------------------------- -| This project includes code from Netty, with the following in its NOTICE file: -| -| | The Netty Project -| | ================= -| | -| | Please visit the Netty web site for more information: -| | -| | * https://netty.io/ -| | -| | Copyright 2014 The Netty Project -| | -| | The Netty Project licenses this file to you under the Apache License, -| | version 2.0 (the "License"); you may not use this file except in compliance -| | with the License. You may obtain a copy of the License at: -| | -| | https://www.apache.org/licenses/LICENSE-2.0 -| | -| | Unless required by applicable law or agreed to in writing, software -| | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -| | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -| | License for the specific language governing permissions and limitations -| | under the License. -| | -| | Also, please refer to each LICENSE..txt file, which is located in -| | the 'license' directory of the distribution file, for the license terms of the -| | components that this product depends on. 
-| | -| | ------------------------------------------------------------------------------- -| | This product contains the extensions to Java Collections Framework which has -| | been derived from the works by JSR-166 EG, Doug Lea, and Jason T. Greene: -| | -| | * LICENSE: -| | * license/LICENSE.jsr166y.txt (Public Domain) -| | * HOMEPAGE: -| | * http://gee.cs.oswego.edu/cgi-bin/viewcvs.cgi/jsr166/ -| | * http://viewvc.jboss.org/cgi-bin/viewvc.cgi/jbosscache/experimental/jsr166/ -| | -| | This product contains a modified version of Robert Harder's Public Domain -| | Base64 Encoder and Decoder, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.base64.txt (Public Domain) -| | * HOMEPAGE: -| | * http://iharder.sourceforge.net/current/java/base64/ -| | -| | This product contains a modified portion of 'Webbit', an event based -| | WebSocket and HTTP server, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.webbit.txt (BSD License) -| | * HOMEPAGE: -| | * https://github.com/joewalnes/webbit -| | -| | This product contains a modified portion of 'SLF4J', a simple logging -| | facade for Java, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.slf4j.txt (MIT License) -| | * HOMEPAGE: -| | * https://www.slf4j.org/ -| | -| | This product contains a modified portion of 'Apache Harmony', an open source -| | Java SE, which can be obtained at: -| | -| | * NOTICE: -| | * license/NOTICE.harmony.txt -| | * LICENSE: -| | * license/LICENSE.harmony.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://archive.apache.org/dist/harmony/ -| | -| | This product contains a modified portion of 'jbzip2', a Java bzip2 compression -| | and decompression library written by Matthew J. Francis. It can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.jbzip2.txt (MIT License) -| | * HOMEPAGE: -| | * https://code.google.com/p/jbzip2/ -| | -| | This product contains a modified portion of 'libdivsufsort', a C API library to construct -| | the suffix array and the Burrows-Wheeler transformed string for any input string of -| | a constant-size alphabet written by Yuta Mori. It can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.libdivsufsort.txt (MIT License) -| | * HOMEPAGE: -| | * https://github.com/y-256/libdivsufsort -| | -| | This product contains a modified portion of Nitsan Wakart's 'JCTools', Java Concurrency Tools for the JVM, -| | which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.jctools.txt (ASL2 License) -| | * HOMEPAGE: -| | * https://github.com/JCTools/JCTools -| | -| | This product optionally depends on 'JZlib', a re-implementation of zlib in -| | pure Java, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.jzlib.txt (BSD style License) -| | * HOMEPAGE: -| | * http://www.jcraft.com/jzlib/ -| | -| | This product optionally depends on 'Compress-LZF', a Java library for encoding and -| | decoding data in LZF format, written by Tatu Saloranta. It can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.compress-lzf.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://github.com/ning/compress -| | -| | This product optionally depends on 'lz4', a LZ4 Java compression -| | and decompression library written by Adrien Grand. 
It can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.lz4.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://github.com/jpountz/lz4-java -| | -| | This product optionally depends on 'lzma-java', a LZMA Java compression -| | and decompression library, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.lzma-java.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://github.com/jponge/lzma-java -| | -| | This product optionally depends on 'zstd-jni', a zstd-jni Java compression -| | and decompression library, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.zstd-jni.txt (BSD) -| | * HOMEPAGE: -| | * https://github.com/luben/zstd-jni -| | -| | This product contains a modified portion of 'jfastlz', a Java port of FastLZ compression -| | and decompression library written by William Kinney. It can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.jfastlz.txt (MIT License) -| | * HOMEPAGE: -| | * https://code.google.com/p/jfastlz/ -| | -| | This product contains a modified portion of and optionally depends on 'Protocol Buffers', Google's data -| | interchange format, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.protobuf.txt (New BSD License) -| | * HOMEPAGE: -| | * https://github.com/google/protobuf -| | -| | This product optionally depends on 'Bouncy Castle Crypto APIs' to generate -| | a temporary self-signed X.509 certificate when the JVM does not provide the -| | equivalent functionality. It can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.bouncycastle.txt (MIT License) -| | * HOMEPAGE: -| | * https://www.bouncycastle.org/ -| | -| | This product optionally depends on 'Snappy', a compression library produced -| | by Google Inc, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.snappy.txt (New BSD License) -| | * HOMEPAGE: -| | * https://github.com/google/snappy -| | -| | This product optionally depends on 'JBoss Marshalling', an alternative Java -| | serialization API, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.jboss-marshalling.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://github.com/jboss-remoting/jboss-marshalling -| | -| | This product optionally depends on 'Caliper', Google's micro- -| | benchmarking framework, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.caliper.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://github.com/google/caliper -| | -| | This product optionally depends on 'Apache Commons Logging', a logging -| | framework, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.commons-logging.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://commons.apache.org/logging/ -| | -| | This product optionally depends on 'Apache Log4J', a logging framework, which -| | can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.log4j.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://logging.apache.org/log4j/ -| | -| | This product optionally depends on 'Aalto XML', an ultra-high performance -| | non-blocking XML processor, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.aalto-xml.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://wiki.fasterxml.com/AaltoHome -| | -| | This product contains a modified version of 'HPACK', a Java implementation of -| | the HTTP/2 HPACK algorithm written by Twitter. 
It can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.hpack.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://github.com/twitter/hpack -| | -| | This product contains a modified version of 'HPACK', a Java implementation of -| | the HTTP/2 HPACK algorithm written by Cory Benfield. It can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.hyper-hpack.txt (MIT License) -| | * HOMEPAGE: -| | * https://github.com/python-hyper/hpack/ -| | -| | This product contains a modified version of 'HPACK', a Java implementation of -| | the HTTP/2 HPACK algorithm written by Tatsuhiro Tsujikawa. It can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.nghttp2-hpack.txt (MIT License) -| | * HOMEPAGE: -| | * https://github.com/nghttp2/nghttp2/ -| | -| | This product contains a modified portion of 'Apache Commons Lang', a Java library -| | provides utilities for the java.lang API, which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.commons-lang.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://commons.apache.org/proper/commons-lang/ -| | -| | -| | This product contains the Maven wrapper scripts from 'Maven Wrapper', that provides an easy way to ensure a user has everything necessary to run the Maven build. -| | -| | * LICENSE: -| | * license/LICENSE.mvn-wrapper.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://github.com/takari/maven-wrapper -| | -| | This product contains the dnsinfo.h header file, that provides a way to retrieve the system DNS configuration on MacOS. -| | This private header is also used by Apple's open source -| | mDNSResponder (https://opensource.apple.com/tarballs/mDNSResponder/). -| | -| | * LICENSE: -| | * license/LICENSE.dnsinfo.txt (Apple Public Source License 2.0) -| | * HOMEPAGE: -| | * https://www.opensource.apple.com/source/configd/configd-453.19/dnsinfo/dnsinfo.h -| | -| | This product optionally depends on 'Brotli4j', Brotli compression and -| | decompression for Java., which can be obtained at: -| | -| | * LICENSE: -| | * license/LICENSE.brotli4j.txt (Apache License 2.0) -| | * HOMEPAGE: -| | * https://github.com/hyperxpro/Brotli4j - --------------------------------------------------------------------------------- - -This binary artifact includes Eclipse Microprofile OpenAPI with the following in its NOTICE file: - -| ========================================================================= -| == NOTICE file corresponding to section 4(d) of the Apache License, == -| == Version 2.0, in this case for MicroProfile OpenAPI == -| ========================================================================= -| -| The majority of this software were originally based on the following: -| * Swagger Core -| https://github.com/swagger-api/swagger-core -| under Apache License, v2.0 -| -| -| SPDXVersion: SPDX-2.1 -| PackageName: Eclipse MicroProfile -| PackageHomePage: http://www.eclipse.org/microprofile -| PackageLicenseDeclared: Apache-2.0 -| -| PackageCopyrightText: -| Arthur De Magalhaes arthurdm@ca.ibm.com -| diff --git a/flink/v1.19/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java b/flink/v1.19/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java deleted file mode 100644 index 3ba3bb71a151..000000000000 --- a/flink/v1.19/flink-runtime/src/integration/java/org/apache/iceberg/flink/TestIcebergConnectorSmoke.java +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license 
agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -public class TestIcebergConnectorSmoke extends TestIcebergConnector {} diff --git a/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java b/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java deleted file mode 100644 index d7c3a7b32bc8..000000000000 --- a/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordSerializerDeserializerBenchmark.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.api.common.serialization.SerializerConfig; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.RowDataConverter; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.Threads; -import org.openjdk.jmh.annotations.Warmup; -import org.openjdk.jmh.infra.Blackhole; -import org.openjdk.jmh.runner.Runner; -import org.openjdk.jmh.runner.RunnerException; -import org.openjdk.jmh.runner.options.Options; -import org.openjdk.jmh.runner.options.OptionsBuilder; - -@Fork(1) -@State(Scope.Benchmark) -@Warmup(iterations = 3) -@Measurement(iterations = 5) -@BenchmarkMode(Mode.SingleShotTime) -public class DynamicRecordSerializerDeserializerBenchmark { - private static final int SAMPLE_SIZE = 100_000; - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "name2", Types.StringType.get()), - Types.NestedField.required(3, "name3", Types.StringType.get()), - Types.NestedField.required(4, "name4", Types.StringType.get()), - Types.NestedField.required(5, "name5", Types.StringType.get()), - Types.NestedField.required(6, "name6", Types.StringType.get()), - Types.NestedField.required(7, "name7", Types.StringType.get()), - Types.NestedField.required(8, "name8", Types.StringType.get()), - Types.NestedField.required(9, "name9", Types.StringType.get())); - - private List rows = Lists.newArrayListWithExpectedSize(SAMPLE_SIZE); - private DynamicRecordInternalType type; - - public static void main(String[] args) throws RunnerException { - Options options = - new OptionsBuilder() - .include(DynamicRecordSerializerDeserializerBenchmark.class.getSimpleName()) - .build(); - new Runner(options).run(); - } - - @Setup - public void setupBenchmark() throws IOException { - List records = RandomGenericData.generate(SCHEMA, SAMPLE_SIZE, 1L); - this.rows = - records.stream() - .map( - r -> - new DynamicRecordInternal( - "t", - "main", - SCHEMA, - RowDataConverter.convert(SCHEMA, r), - PartitionSpec.unpartitioned(), - 1, - false, - Collections.emptySet())) - .collect(Collectors.toList()); - - File warehouse = Files.createTempFile("perf-bench", null).toFile(); - CatalogLoader catalogLoader = - CatalogLoader.hadoop( - "hadoop", - new Configuration(), - ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, 
warehouse.getPath())); - this.type = new DynamicRecordInternalType(catalogLoader, true, 100); - } - - @Benchmark - @Threads(1) - public void testSerialize(Blackhole blackhole) throws IOException { - TypeSerializer serializer = - type.createSerializer((SerializerConfig) null); - DataOutputSerializer outputView = new DataOutputSerializer(1024); - for (int i = 0; i < SAMPLE_SIZE; ++i) { - serializer.serialize(rows.get(i), outputView); - } - } - - @Benchmark - @Threads(1) - public void testSerializeAndDeserialize(Blackhole blackhole) throws IOException { - TypeSerializer serializer = - type.createSerializer((SerializerConfig) null); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - for (int i = 0; i < SAMPLE_SIZE; ++i) { - serializer.serialize(rows.get(i), outputView); - serializer.deserialize(new DataInputDeserializer(outputView.getSharedBuffer())); - } - } -} diff --git a/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java b/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java deleted file mode 100644 index 80a46ac530e1..000000000000 --- a/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import java.util.concurrent.ThreadLocalRandom; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderComparators; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; -import org.openjdk.jmh.annotations.Warmup; -import org.openjdk.jmh.infra.Blackhole; - -@Fork(1) -@State(Scope.Benchmark) -@Warmup(iterations = 3) -@Measurement(iterations = 5) -@BenchmarkMode(Mode.SingleShotTime) -public class MapRangePartitionerBenchmark { - - private static final int SAMPLE_SIZE = 100_000; - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "name2", Types.StringType.get()), - Types.NestedField.required(3, "name3", Types.StringType.get()), - Types.NestedField.required(4, "name4", Types.StringType.get()), - Types.NestedField.required(5, "name5", Types.StringType.get()), - Types.NestedField.required(6, "name6", Types.StringType.get()), - Types.NestedField.required(7, "name7", Types.StringType.get()), - Types.NestedField.required(8, "name8", Types.StringType.get()), - Types.NestedField.required(9, "name9", Types.StringType.get())); - - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - private static final Comparator SORT_ORDER_COMPARTOR = - SortOrderComparators.forSchema(SCHEMA, SORT_ORDER); - private static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); - private static final int PARALLELISM = 100; - - private MapRangePartitioner partitioner; - private RowData[] rows; - - @Setup - public void setupBenchmark() { - NavigableMap weights = - DataDistributionUtil.longTailDistribution(100_000, 24, 240, 100, 2.0, 0.7); - Map mapStatistics = - DataDistributionUtil.mapStatisticsWithLongTailDistribution(weights, SORT_KEY); - - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(PARALLELISM, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); - this.partitioner = new MapRangePartitioner(SCHEMA, SORT_ORDER, mapAssignment); - - List keys = Lists.newArrayList(weights.keySet().iterator()); - long[] weightsCDF = DataDistributionUtil.computeCumulativeWeights(keys, weights); - long totalWeight = weightsCDF[weightsCDF.length - 1]; - - // pre-calculate the samples for benchmark run - this.rows = new GenericRowData[SAMPLE_SIZE]; - for (int i = 0; i < SAMPLE_SIZE; ++i) { - long weight = ThreadLocalRandom.current().nextLong(totalWeight); - int index = DataDistributionUtil.binarySearchIndex(weightsCDF, weight); - rows[i] = - GenericRowData.of( - keys.get(index), - DataDistributionUtil.randomString("name2-", 200), - DataDistributionUtil.randomString("name3-", 200), - 
DataDistributionUtil.randomString("name4-", 200), - DataDistributionUtil.randomString("name5-", 200), - DataDistributionUtil.randomString("name6-", 200), - DataDistributionUtil.randomString("name7-", 200), - DataDistributionUtil.randomString("name8-", 200), - DataDistributionUtil.randomString("name9-", 200)); - } - } - - @TearDown - public void tearDownBenchmark() {} - - @Benchmark - @Threads(1) - public void testPartitionerLongTailDistribution(Blackhole blackhole) { - for (int i = 0; i < SAMPLE_SIZE; ++i) { - blackhole.consume(partitioner.partition(rows[i], PARALLELISM)); - } - } -} diff --git a/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java b/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java deleted file mode 100644 index 53a24cd8968a..000000000000 --- a/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitionerBenchmark.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Arrays; -import java.util.UUID; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.types.Types; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; -import org.openjdk.jmh.annotations.Warmup; -import org.openjdk.jmh.infra.Blackhole; - -@Fork(1) -@State(Scope.Benchmark) -@Warmup(iterations = 3) -@Measurement(iterations = 5) -@BenchmarkMode(Mode.SingleShotTime) -public class SketchRangePartitionerBenchmark { - - private static final int SAMPLE_SIZE = 100_000; - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.UUIDType.get()), - Types.NestedField.required(2, "name2", Types.StringType.get()), - Types.NestedField.required(3, "name3", Types.StringType.get()), - Types.NestedField.required(4, "name4", Types.StringType.get()), - Types.NestedField.required(5, "name5", Types.StringType.get()), - Types.NestedField.required(6, "name6", Types.StringType.get()), - Types.NestedField.required(7, "name7", Types.StringType.get()), - Types.NestedField.required(8, "name8", Types.StringType.get()), - Types.NestedField.required(9, "name9", Types.StringType.get())); - - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - private static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); - private static final int PARALLELISM = 100; - - private SketchRangePartitioner partitioner; - private RowData[] rows; - - @Setup - public void setupBenchmark() { - UUID[] reservoir = DataDistributionUtil.reservoirSampleUUIDs(1_000_000, 100_000); - UUID[] rangeBound = DataDistributionUtil.rangeBoundSampleUUIDs(reservoir, PARALLELISM); - SortKey[] rangeBoundSortKeys = - Arrays.stream(rangeBound) - .map( - uuid -> { - SortKey sortKeyCopy = SORT_KEY.copy(); - sortKeyCopy.set(0, uuid); - return sortKeyCopy; - }) - .toArray(SortKey[]::new); - - this.partitioner = new SketchRangePartitioner(SCHEMA, SORT_ORDER, rangeBoundSortKeys); - - // pre-calculate the samples for benchmark run - this.rows = new GenericRowData[SAMPLE_SIZE]; - for (int i = 0; i < SAMPLE_SIZE; ++i) { - UUID uuid = UUID.randomUUID(); - Object uuidBytes = DataDistributionUtil.uuidBytes(uuid); - rows[i] = - GenericRowData.of( - uuidBytes, - DataDistributionUtil.randomString("name2-", 200), - DataDistributionUtil.randomString("name3-", 200), - DataDistributionUtil.randomString("name4-", 200), - DataDistributionUtil.randomString("name5-", 200), - DataDistributionUtil.randomString("name6-", 200), - DataDistributionUtil.randomString("name7-", 200), - DataDistributionUtil.randomString("name8-", 200), - DataDistributionUtil.randomString("name9-", 200)); - } - } - - @TearDown - public void tearDownBenchmark() {} - - @Benchmark - @Threads(1) - public void testPartitionerLongTailDistribution(Blackhole blackhole) { - for (int i = 0; i < SAMPLE_SIZE; ++i) { - blackhole.consume(partitioner.partition(rows[i], PARALLELISM)); - } - } -} diff --git 
a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java deleted file mode 100644 index 18473bf4f190..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.Serializable; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.hadoop.SerializableConfiguration; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.rest.RESTCatalog; - -/** Serializable loader to load an Iceberg {@link Catalog}. */ -public interface CatalogLoader extends Serializable, Cloneable { - - /** - * Create a new catalog with the provided properties. NOTICE: for flink, we may initialize the - * {@link CatalogLoader} at flink sql client side or job manager side, and then serialize this - * catalog loader to task manager, finally deserialize it and create a new catalog at task manager - * side. - * - * @return a newly created {@link Catalog} - */ - Catalog loadCatalog(); - - /** Clone a CatalogLoader. 
*/ - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - CatalogLoader clone(); - - static CatalogLoader hadoop( - String name, Configuration hadoopConf, Map properties) { - return new HadoopCatalogLoader(name, hadoopConf, properties); - } - - static CatalogLoader hive(String name, Configuration hadoopConf, Map properties) { - return new HiveCatalogLoader(name, hadoopConf, properties); - } - - static CatalogLoader rest(String name, Configuration hadoopConf, Map properties) { - return new RESTCatalogLoader(name, hadoopConf, properties); - } - - static CatalogLoader custom( - String name, Map properties, Configuration hadoopConf, String impl) { - return new CustomCatalogLoader(name, properties, hadoopConf, impl); - } - - class HadoopCatalogLoader implements CatalogLoader { - private final String catalogName; - private final SerializableConfiguration hadoopConf; - private final String warehouseLocation; - private final Map properties; - - private HadoopCatalogLoader( - String catalogName, Configuration conf, Map properties) { - this.catalogName = catalogName; - this.hadoopConf = new SerializableConfiguration(conf); - this.warehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - this.properties = Maps.newHashMap(properties); - } - - @Override - public Catalog loadCatalog() { - return CatalogUtil.loadCatalog( - HadoopCatalog.class.getName(), catalogName, properties, hadoopConf.get()); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public CatalogLoader clone() { - return new HadoopCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("catalogName", catalogName) - .add("warehouseLocation", warehouseLocation) - .toString(); - } - } - - class HiveCatalogLoader implements CatalogLoader { - private final String catalogName; - private final SerializableConfiguration hadoopConf; - private final String uri; - private final String warehouse; - private final int clientPoolSize; - private final Map properties; - - private HiveCatalogLoader( - String catalogName, Configuration conf, Map properties) { - this.catalogName = catalogName; - this.hadoopConf = new SerializableConfiguration(conf); - this.uri = properties.get(CatalogProperties.URI); - this.warehouse = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - this.clientPoolSize = - properties.containsKey(CatalogProperties.CLIENT_POOL_SIZE) - ? 
Integer.parseInt(properties.get(CatalogProperties.CLIENT_POOL_SIZE)) - : CatalogProperties.CLIENT_POOL_SIZE_DEFAULT; - this.properties = Maps.newHashMap(properties); - } - - @Override - public Catalog loadCatalog() { - return CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), catalogName, properties, hadoopConf.get()); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public CatalogLoader clone() { - return new HiveCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("catalogName", catalogName) - .add("uri", uri) - .add("warehouse", warehouse) - .add("clientPoolSize", clientPoolSize) - .toString(); - } - } - - class RESTCatalogLoader implements CatalogLoader { - private final String catalogName; - private final SerializableConfiguration hadoopConf; - private final Map properties; - - private RESTCatalogLoader( - String catalogName, Configuration conf, Map properties) { - this.catalogName = catalogName; - this.hadoopConf = new SerializableConfiguration(conf); - this.properties = Maps.newHashMap(properties); - } - - @Override - public Catalog loadCatalog() { - return CatalogUtil.loadCatalog( - RESTCatalog.class.getName(), catalogName, properties, hadoopConf.get()); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public CatalogLoader clone() { - return new RESTCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("catalogName", catalogName) - .add("properties", properties) - .toString(); - } - } - - class CustomCatalogLoader implements CatalogLoader { - - private final SerializableConfiguration hadoopConf; - private final Map properties; - private final String name; - private final String impl; - - private CustomCatalogLoader( - String name, Map properties, Configuration conf, String impl) { - this.hadoopConf = new SerializableConfiguration(conf); - this.properties = Maps.newHashMap(properties); // wrap into a hashmap for serialization - this.name = name; - this.impl = - Preconditions.checkNotNull( - impl, "Cannot initialize custom Catalog, impl class name is null"); - } - - @Override - public Catalog loadCatalog() { - return CatalogUtil.loadCatalog(impl, name, properties, hadoopConf.get()); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public CatalogLoader clone() { - return new CustomCatalogLoader(name, properties, new Configuration(hadoopConf.get()), impl); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("name", name).add("impl", impl).toString(); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java deleted file mode 100644 index 4c44961df19c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java +++ /dev/null @@ -1,869 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.Closeable; -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.AbstractCatalog; -import org.apache.flink.table.catalog.CatalogBaseTable; -import org.apache.flink.table.catalog.CatalogDatabase; -import org.apache.flink.table.catalog.CatalogDatabaseImpl; -import org.apache.flink.table.catalog.CatalogFunction; -import org.apache.flink.table.catalog.CatalogPartition; -import org.apache.flink.table.catalog.CatalogPartitionSpec; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.CatalogTableImpl; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.ResolvedCatalogTable; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.catalog.TableChange; -import org.apache.flink.table.catalog.exceptions.CatalogException; -import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; -import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; -import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; -import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; -import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; -import org.apache.flink.table.catalog.stats.CatalogTableStatistics; -import org.apache.flink.table.expressions.Expression; -import org.apache.flink.table.factories.Factory; -import org.apache.flink.util.StringUtils; -import org.apache.iceberg.CachingCatalog; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.SupportsNamespaces; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.AlreadyExistsException; -import org.apache.iceberg.exceptions.NamespaceNotEmptyException; -import org.apache.iceberg.exceptions.NoSuchNamespaceException; -import org.apache.iceberg.flink.util.FlinkAlterTableUtil; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.base.Splitter; -import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.iceberg.relocated.com.google.common.collect.Maps;
-import org.apache.iceberg.relocated.com.google.common.collect.Sets;
-
-/**
- * A Flink Catalog implementation that wraps an Iceberg {@link Catalog}.
- *
- * <p>The mapping between Flink database and Iceberg namespace: Supplying a base namespace for a
- * given catalog, so if you have a catalog that supports a 2-level namespace, you would supply the
- * first level in the catalog configuration and the second level would be exposed as Flink
- * databases.
- *
- * <p>
    The Iceberg table manages its partitions by itself. The partition of the Iceberg table is - * independent of the partition of Flink. - */ -@Internal -public class FlinkCatalog extends AbstractCatalog { - private final CatalogLoader catalogLoader; - private final Catalog icebergCatalog; - private final Namespace baseNamespace; - private final SupportsNamespaces asNamespaceCatalog; - private final Closeable closeable; - private final Map catalogProps; - private final boolean cacheEnabled; - - public FlinkCatalog( - String catalogName, - String defaultDatabase, - Namespace baseNamespace, - CatalogLoader catalogLoader, - Map catalogProps, - boolean cacheEnabled, - long cacheExpirationIntervalMs) { - super(catalogName, defaultDatabase); - this.catalogLoader = catalogLoader; - this.catalogProps = catalogProps; - this.baseNamespace = baseNamespace; - this.cacheEnabled = cacheEnabled; - - Catalog originalCatalog = catalogLoader.loadCatalog(); - icebergCatalog = - cacheEnabled - ? CachingCatalog.wrap(originalCatalog, cacheExpirationIntervalMs) - : originalCatalog; - asNamespaceCatalog = - originalCatalog instanceof SupportsNamespaces ? (SupportsNamespaces) originalCatalog : null; - closeable = originalCatalog instanceof Closeable ? (Closeable) originalCatalog : null; - - FlinkEnvironmentContext.init(); - } - - @Override - public void open() throws CatalogException {} - - @Override - public void close() throws CatalogException { - if (closeable != null) { - try { - closeable.close(); - } catch (IOException e) { - throw new CatalogException(e); - } - } - } - - public Catalog catalog() { - return icebergCatalog; - } - - /** Append a new level to the base namespace */ - private static Namespace appendLevel(Namespace baseNamespace, String newLevel) { - String[] namespace = new String[baseNamespace.levels().length + 1]; - System.arraycopy(baseNamespace.levels(), 0, namespace, 0, baseNamespace.levels().length); - namespace[baseNamespace.levels().length] = newLevel; - return Namespace.of(namespace); - } - - TableIdentifier toIdentifier(ObjectPath path) { - String objectName = path.getObjectName(); - List tableName = Splitter.on('$').splitToList(objectName); - - if (tableName.size() == 1) { - return TableIdentifier.of( - appendLevel(baseNamespace, path.getDatabaseName()), path.getObjectName()); - } else if (tableName.size() == 2 && MetadataTableType.from(tableName.get(1)) != null) { - return TableIdentifier.of( - appendLevel(appendLevel(baseNamespace, path.getDatabaseName()), tableName.get(0)), - tableName.get(1)); - } else { - throw new IllegalArgumentException("Illegal table name:" + objectName); - } - } - - @Override - public List listDatabases() throws CatalogException { - if (asNamespaceCatalog == null) { - return Collections.singletonList(getDefaultDatabase()); - } - - return asNamespaceCatalog.listNamespaces(baseNamespace).stream() - .map(n -> n.level(n.levels().length - 1)) - .collect(Collectors.toList()); - } - - @Override - public CatalogDatabase getDatabase(String databaseName) - throws DatabaseNotExistException, CatalogException { - if (asNamespaceCatalog == null) { - if (!getDefaultDatabase().equals(databaseName)) { - throw new DatabaseNotExistException(getName(), databaseName); - } else { - return new CatalogDatabaseImpl(Maps.newHashMap(), ""); - } - } else { - try { - Map metadata = - Maps.newHashMap( - asNamespaceCatalog.loadNamespaceMetadata(appendLevel(baseNamespace, databaseName))); - String comment = metadata.remove("comment"); - return new CatalogDatabaseImpl(metadata, comment); - } 
catch (NoSuchNamespaceException e) { - throw new DatabaseNotExistException(getName(), databaseName, e); - } - } - } - - @Override - public boolean databaseExists(String databaseName) throws CatalogException { - try { - getDatabase(databaseName); - return true; - } catch (DatabaseNotExistException ignore) { - return false; - } - } - - @Override - public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) - throws DatabaseAlreadyExistException, CatalogException { - createDatabase( - name, mergeComment(database.getProperties(), database.getComment()), ignoreIfExists); - } - - private void createDatabase( - String databaseName, Map metadata, boolean ignoreIfExists) - throws DatabaseAlreadyExistException, CatalogException { - if (asNamespaceCatalog != null) { - try { - asNamespaceCatalog.createNamespace(appendLevel(baseNamespace, databaseName), metadata); - } catch (AlreadyExistsException e) { - if (!ignoreIfExists) { - throw new DatabaseAlreadyExistException(getName(), databaseName, e); - } - } - } else { - throw new UnsupportedOperationException( - "Namespaces are not supported by catalog: " + getName()); - } - } - - private Map mergeComment(Map metadata, String comment) { - Map ret = Maps.newHashMap(metadata); - if (metadata.containsKey("comment")) { - throw new CatalogException("Database properties should not contain key: 'comment'."); - } - - if (!StringUtils.isNullOrWhitespaceOnly(comment)) { - ret.put("comment", comment); - } - return ret; - } - - @Override - public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) - throws DatabaseNotExistException, DatabaseNotEmptyException, CatalogException { - if (asNamespaceCatalog != null) { - try { - boolean success = asNamespaceCatalog.dropNamespace(appendLevel(baseNamespace, name)); - if (!success && !ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name); - } - } catch (NoSuchNamespaceException e) { - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name, e); - } - } catch (NamespaceNotEmptyException e) { - throw new DatabaseNotEmptyException(getName(), name, e); - } - } else { - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name); - } - } - } - - @Override - public void alterDatabase(String name, CatalogDatabase newDatabase, boolean ignoreIfNotExists) - throws DatabaseNotExistException, CatalogException { - if (asNamespaceCatalog != null) { - Namespace namespace = appendLevel(baseNamespace, name); - Map updates = Maps.newHashMap(); - Set removals = Sets.newHashSet(); - - try { - Map oldProperties = asNamespaceCatalog.loadNamespaceMetadata(namespace); - Map newProperties = - mergeComment(newDatabase.getProperties(), newDatabase.getComment()); - - for (String key : oldProperties.keySet()) { - if (!newProperties.containsKey(key)) { - removals.add(key); - } - } - - for (Map.Entry entry : newProperties.entrySet()) { - if (!entry.getValue().equals(oldProperties.get(entry.getKey()))) { - updates.put(entry.getKey(), entry.getValue()); - } - } - - if (!updates.isEmpty()) { - asNamespaceCatalog.setProperties(namespace, updates); - } - - if (!removals.isEmpty()) { - asNamespaceCatalog.removeProperties(namespace, removals); - } - - } catch (NoSuchNamespaceException e) { - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name, e); - } - } - } else { - if (getDefaultDatabase().equals(name)) { - throw new CatalogException( - "Can not alter the default database when the iceberg catalog doesn't support 
namespaces."); - } - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name); - } - } - } - - @Override - public List listTables(String databaseName) - throws DatabaseNotExistException, CatalogException { - try { - return icebergCatalog.listTables(appendLevel(baseNamespace, databaseName)).stream() - .map(TableIdentifier::name) - .collect(Collectors.toList()); - } catch (NoSuchNamespaceException e) { - throw new DatabaseNotExistException(getName(), databaseName, e); - } - } - - @Override - public CatalogTable getTable(ObjectPath tablePath) - throws TableNotExistException, CatalogException { - Table table = loadIcebergTable(tablePath); - - // Flink's CREATE TABLE LIKE clause relies on properties sent back here to create new table. - // Inorder to create such table in non iceberg catalog, we need to send across catalog - // properties also. - // As Flink API accepts only Map for props, here we are serializing catalog - // props as json string to distinguish between catalog and table properties in createTable. - String srcCatalogProps = - FlinkCreateTableOptions.toJson( - getName(), tablePath.getDatabaseName(), tablePath.getObjectName(), catalogProps); - - Map tableProps = table.properties(); - if (tableProps.containsKey(FlinkCreateTableOptions.CONNECTOR_PROPS_KEY) - || tableProps.containsKey(FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY)) { - throw new IllegalArgumentException( - String.format( - "Source table %s contains one/all of the reserved property keys: %s, %s.", - tablePath, - FlinkCreateTableOptions.CONNECTOR_PROPS_KEY, - FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY)); - } - - ImmutableMap.Builder mergedProps = ImmutableMap.builder(); - mergedProps.put( - FlinkCreateTableOptions.CONNECTOR_PROPS_KEY, FlinkDynamicTableFactory.FACTORY_IDENTIFIER); - mergedProps.put(FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY, srcCatalogProps); - mergedProps.putAll(tableProps); - - return toCatalogTableWithProps(table, mergedProps.build()); - } - - private Table loadIcebergTable(ObjectPath tablePath) throws TableNotExistException { - try { - Table table = icebergCatalog.loadTable(toIdentifier(tablePath)); - if (cacheEnabled) { - table.refresh(); - } - - return table; - } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { - throw new TableNotExistException(getName(), tablePath, e); - } - } - - @Override - public boolean tableExists(ObjectPath tablePath) throws CatalogException { - return icebergCatalog.tableExists(toIdentifier(tablePath)); - } - - @Override - public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) - throws TableNotExistException, CatalogException { - try { - icebergCatalog.dropTable(toIdentifier(tablePath)); - } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { - if (!ignoreIfNotExists) { - throw new TableNotExistException(getName(), tablePath, e); - } - } - } - - @Override - public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) - throws TableNotExistException, TableAlreadyExistException, CatalogException { - try { - icebergCatalog.renameTable( - toIdentifier(tablePath), - toIdentifier(new ObjectPath(tablePath.getDatabaseName(), newTableName))); - } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { - if (!ignoreIfNotExists) { - throw new TableNotExistException(getName(), tablePath, e); - } - } catch (AlreadyExistsException e) { - throw new TableAlreadyExistException(getName(), tablePath, e); - } - } - - @Override - public void createTable(ObjectPath tablePath, 
CatalogBaseTable table, boolean ignoreIfExists) - throws CatalogException, TableAlreadyExistException { - // Creating Iceberg table using connector is allowed only when table is created using LIKE - if (Objects.equals( - table.getOptions().get(FlinkCreateTableOptions.CONNECTOR_PROPS_KEY), - FlinkDynamicTableFactory.FACTORY_IDENTIFIER) - && table.getOptions().get(FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY) == null) { - throw new IllegalArgumentException( - "Cannot create the table with 'connector'='iceberg' table property in " - + "an iceberg catalog, Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " - + "create table without 'connector'='iceberg' related properties in an iceberg table."); - } - - Preconditions.checkArgument(table instanceof ResolvedCatalogTable, "table should be resolved"); - createIcebergTable(tablePath, (ResolvedCatalogTable) table, ignoreIfExists); - } - - void createIcebergTable(ObjectPath tablePath, ResolvedCatalogTable table, boolean ignoreIfExists) - throws CatalogException, TableAlreadyExistException { - validateFlinkTable(table); - - Schema icebergSchema = FlinkSchemaUtil.convert(table.getResolvedSchema()); - PartitionSpec spec = toPartitionSpec(table.getPartitionKeys(), icebergSchema); - ImmutableMap.Builder properties = ImmutableMap.builder(); - String location = null; - for (Map.Entry entry : table.getOptions().entrySet()) { - if (!isReservedProperty(entry.getKey())) { - properties.put(entry.getKey(), entry.getValue()); - } else { - // Filtering reserved properties like catalog properties(added to support CREATE TABLE LIKE - // in getTable()), location and not persisting on table properties. - if (FlinkCreateTableOptions.LOCATION_KEY.equalsIgnoreCase(entry.getKey())) { - location = entry.getValue(); - } - } - } - - try { - icebergCatalog.createTable( - toIdentifier(tablePath), icebergSchema, spec, location, properties.build()); - } catch (AlreadyExistsException e) { - if (!ignoreIfExists) { - throw new TableAlreadyExistException(getName(), tablePath, e); - } - } - } - - private boolean isReservedProperty(String prop) { - return FlinkCreateTableOptions.LOCATION_KEY.equalsIgnoreCase(prop) - || FlinkCreateTableOptions.CONNECTOR_PROPS_KEY.equalsIgnoreCase(prop) - || FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY.equalsIgnoreCase(prop); - } - - private static void validateTableSchemaAndPartition(CatalogTable ct1, CatalogTable ct2) { - if (!Objects.equals(ct1.getUnresolvedSchema(), ct2.getUnresolvedSchema())) { - throw new UnsupportedOperationException( - "Altering schema is not supported in the old alterTable API. " - + "To alter schema, use the other alterTable API and provide a list of TableChange's."); - } - - validateTablePartition(ct1, ct2); - } - - private static void validateTablePartition(CatalogTable ct1, CatalogTable ct2) { - if (!ct1.getPartitionKeys().equals(ct2.getPartitionKeys())) { - throw new UnsupportedOperationException("Altering partition keys is not supported yet."); - } - } - - /** - * This alterTable API only supports altering table properties. - * - *

- * <p>Support for adding/removing/renaming columns cannot be done by comparing CatalogTable
- * instances, unless the Flink schema contains Iceberg column IDs.
- *
- * <p>
    To alter columns, use the other alterTable API and provide a list of TableChange's. - * - * @param tablePath path of the table or view to be modified - * @param newTable the new table definition - * @param ignoreIfNotExists flag to specify behavior when the table or view does not exist: if set - * to false, throw an exception, if set to true, do nothing. - * @throws CatalogException in case of any runtime exception - * @throws TableNotExistException if the table does not exist - */ - @Override - public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean ignoreIfNotExists) - throws CatalogException, TableNotExistException { - validateFlinkTable(newTable); - - Table icebergTable; - try { - icebergTable = loadIcebergTable(tablePath); - } catch (TableNotExistException e) { - if (!ignoreIfNotExists) { - throw e; - } else { - return; - } - } - - CatalogTable table = toCatalogTable(icebergTable); - validateTableSchemaAndPartition(table, (CatalogTable) newTable); - - Map oldProperties = table.getOptions(); - Map setProperties = Maps.newHashMap(); - - String setLocation = null; - String setSnapshotId = null; - String pickSnapshotId = null; - - for (Map.Entry entry : newTable.getOptions().entrySet()) { - String key = entry.getKey(); - String value = entry.getValue(); - - if (Objects.equals(value, oldProperties.get(key))) { - continue; - } - - if (FlinkCreateTableOptions.LOCATION_KEY.equalsIgnoreCase(key)) { - setLocation = value; - } else if ("current-snapshot-id".equalsIgnoreCase(key)) { - setSnapshotId = value; - } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(key)) { - pickSnapshotId = value; - } else { - setProperties.put(key, value); - } - } - - oldProperties - .keySet() - .forEach( - k -> { - if (!newTable.getOptions().containsKey(k)) { - setProperties.put(k, null); - } - }); - - FlinkAlterTableUtil.commitChanges( - icebergTable, setLocation, setSnapshotId, pickSnapshotId, setProperties); - } - - @Override - public void alterTable( - ObjectPath tablePath, - CatalogBaseTable newTable, - List tableChanges, - boolean ignoreIfNotExists) - throws TableNotExistException, CatalogException { - validateFlinkTable(newTable); - - Table icebergTable; - try { - icebergTable = loadIcebergTable(tablePath); - } catch (TableNotExistException e) { - if (!ignoreIfNotExists) { - throw e; - } else { - return; - } - } - - // Does not support altering partition yet. 
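// Illustrative sketch, not part of the original patch: how the TableChange-based alterTable
// described above is normally reached. Flink translates the DDL below into TableChange objects
// (AddColumn, SetOption, ...) and dispatches them to the registered Iceberg catalog.
// Catalog, database, table and column names are hypothetical.
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

class AlterTableSketch {
  public static void main(String[] args) {
    TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());
    // Assumes an Iceberg catalog named "my_catalog" was registered beforehand.
    tEnv.executeSql("ALTER TABLE my_catalog.db.sample ADD (category STRING)");
    // Property changes go through the SetOption branch handled in the surrounding method.
    tEnv.executeSql("ALTER TABLE my_catalog.db.sample SET ('write.format.default'='orc')");
  }
}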
- validateTablePartition(toCatalogTable(icebergTable), (CatalogTable) newTable); - - String setLocation = null; - String setSnapshotId = null; - String cherrypickSnapshotId = null; - - List propertyChanges = Lists.newArrayList(); - List schemaChanges = Lists.newArrayList(); - for (TableChange change : tableChanges) { - if (change instanceof TableChange.SetOption) { - TableChange.SetOption set = (TableChange.SetOption) change; - - if (FlinkCreateTableOptions.LOCATION_KEY.equalsIgnoreCase(set.getKey())) { - setLocation = set.getValue(); - } else if ("current-snapshot-id".equalsIgnoreCase(set.getKey())) { - setSnapshotId = set.getValue(); - } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(set.getKey())) { - cherrypickSnapshotId = set.getValue(); - } else { - propertyChanges.add(change); - } - } else if (change instanceof TableChange.ResetOption) { - propertyChanges.add(change); - } else { - schemaChanges.add(change); - } - } - - FlinkAlterTableUtil.commitChanges( - icebergTable, - setLocation, - setSnapshotId, - cherrypickSnapshotId, - schemaChanges, - propertyChanges); - } - - private static void validateFlinkTable(CatalogBaseTable table) { - Preconditions.checkArgument( - table instanceof CatalogTable, "The Table should be a CatalogTable."); - - org.apache.flink.table.api.Schema schema = table.getUnresolvedSchema(); - schema - .getColumns() - .forEach( - column -> { - if (!FlinkCompatibilityUtil.isPhysicalColumn(column)) { - throw new UnsupportedOperationException( - "Creating table with computed columns is not supported yet."); - } - }); - - if (!schema.getWatermarkSpecs().isEmpty()) { - throw new UnsupportedOperationException( - "Creating table with watermark specs is not supported yet."); - } - } - - private static PartitionSpec toPartitionSpec(List partitionKeys, Schema icebergSchema) { - PartitionSpec.Builder builder = PartitionSpec.builderFor(icebergSchema); - partitionKeys.forEach(builder::identity); - return builder.build(); - } - - private static List toPartitionKeys(PartitionSpec spec, Schema icebergSchema) { - ImmutableList.Builder partitionKeysBuilder = ImmutableList.builder(); - for (PartitionField field : spec.fields()) { - if (field.transform().isIdentity()) { - partitionKeysBuilder.add(icebergSchema.findColumnName(field.sourceId())); - } else { - // Not created by Flink SQL. - // For compatibility with iceberg tables, return empty. - // TODO modify this after Flink support partition transform. - return Collections.emptyList(); - } - } - return partitionKeysBuilder.build(); - } - - static CatalogTable toCatalogTableWithProps(Table table, Map props) { - ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(table.schema()); - List partitionKeys = toPartitionKeys(table.spec(), table.schema()); - - // NOTE: We can not create a IcebergCatalogTable extends CatalogTable, because Flink optimizer - // may use CatalogTableImpl to copy a new catalog table. - // Let's re-loading table from Iceberg catalog when creating source/sink operators. - // Iceberg does not have Table comment, so pass a null (Default comment value in Flink). 
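// Illustrative sketch, not part of the original patch: the identity-only mapping implemented by
// toPartitionSpec/toPartitionKeys above. A Flink PARTITIONED BY column becomes an Iceberg
// identity partition field; any non-identity transform makes the table appear unpartitioned to
// Flink. Field names and ids are hypothetical.
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

class IdentityPartitionSketch {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.required(2, "dt", Types.StringType.get()));
    // Equivalent of CREATE TABLE ... PARTITIONED BY (dt) in Flink SQL.
    PartitionSpec spec = PartitionSpec.builderFor(schema).identity("dt").build();
    System.out.println(spec.fields()); // a single identity field on "dt"
  }
}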
- return new CatalogTableImpl( - TableSchema.fromResolvedSchema(resolvedSchema), partitionKeys, props, null); - } - - static CatalogTable toCatalogTable(Table table) { - return toCatalogTableWithProps(table, table.properties()); - } - - @Override - public Optional getFactory() { - return Optional.of(new FlinkDynamicTableFactory(this)); - } - - CatalogLoader getCatalogLoader() { - return catalogLoader; - } - - // ------------------------------ Unsupported methods - // --------------------------------------------- - - @Override - public List listViews(String databaseName) throws CatalogException { - return Collections.emptyList(); - } - - @Override - public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void createPartition( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogPartition partition, - boolean ignoreIfExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void dropPartition( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartition( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogPartition newPartition, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List listFunctions(String dbName) throws CatalogException { - return Collections.emptyList(); - } - - @Override - public CatalogFunction getFunction(ObjectPath functionPath) - throws FunctionNotExistException, CatalogException { - throw new FunctionNotExistException(getName(), functionPath); - } - - @Override - public boolean functionExists(ObjectPath functionPath) throws CatalogException { - return false; - } - - @Override - public void createFunction( - ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterFunction( - ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterTableStatistics( - ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterTableColumnStatistics( - ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartitionStatistics( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogTableStatistics partitionStatistics, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartitionColumnStatistics( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - 
CatalogColumnStatistics columnStatistics, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List listPartitions(ObjectPath tablePath) - throws TableNotExistException, TableNotPartitionedException, CatalogException { - Table table = loadIcebergTable(tablePath); - - if (table.spec().isUnpartitioned()) { - throw new TableNotPartitionedException(icebergCatalog.name(), tablePath); - } - - Set set = Sets.newHashSet(); - try (CloseableIterable tasks = table.newScan().planFiles()) { - for (DataFile dataFile : CloseableIterable.transform(tasks, FileScanTask::file)) { - Map map = Maps.newHashMap(); - StructLike structLike = dataFile.partition(); - PartitionSpec spec = table.specs().get(dataFile.specId()); - for (int i = 0; i < structLike.size(); i++) { - map.put(spec.fields().get(i).name(), String.valueOf(structLike.get(i, Object.class))); - } - set.add(new CatalogPartitionSpec(map)); - } - } catch (IOException e) { - throw new CatalogException( - String.format("Failed to list partitions of table %s", tablePath), e); - } - - return Lists.newArrayList(set); - } - - @Override - public List listPartitions( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List listPartitionsByFilter( - ObjectPath tablePath, List filters) throws CatalogException { - throw new UnsupportedOperationException(); - } - - // After partition pruning and filter push down, the statistics have become very inaccurate, so - // the statistics from - // here are of little significance. - // Flink will support something like SupportsReportStatistics in future. - - @Override - public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) throws CatalogException { - return CatalogTableStatistics.UNKNOWN; - } - - @Override - public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) - throws CatalogException { - return CatalogColumnStatistics.UNKNOWN; - } - - @Override - public CatalogTableStatistics getPartitionStatistics( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { - return CatalogTableStatistics.UNKNOWN; - } - - @Override - public CatalogColumnStatistics getPartitionColumnStatistics( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { - return CatalogColumnStatistics.UNKNOWN; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java deleted file mode 100644 index dd065617bd88..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.net.URL; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import org.apache.flink.configuration.GlobalConfiguration; -import org.apache.flink.runtime.util.HadoopUtils; -import org.apache.flink.table.catalog.Catalog; -import org.apache.flink.table.factories.CatalogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.base.Strings; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.PropertyUtil; - -/** - * A Flink Catalog factory implementation that creates {@link FlinkCatalog}. - * - *

- * <p>This supports the following catalog configuration options:
- *
- * <ul>
- *   <li><code>type</code> - Flink catalog factory key, should be "iceberg"
- *   <li><code>catalog-type</code> - iceberg catalog type, "hive", "hadoop" or "rest"
- *   <li><code>uri</code> - the Hive Metastore URI (Hive catalog only)
- *   <li><code>clients</code> - the Hive Client Pool Size (Hive catalog only)
- *   <li><code>warehouse</code> - the warehouse path (Hadoop catalog only)
- *   <li><code>default-database</code> - a database name to use as the default
- *   <li><code>base-namespace</code> - a base namespace as the prefix for all databases (Hadoop
- *       catalog only)
- *   <li><code>cache-enabled</code> - whether to enable catalog cache
- * </ul>
- *
- * <p>
    To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override - * {@link #createCatalogLoader(String, Map, Configuration)}. - */ -public class FlinkCatalogFactory implements CatalogFactory { - - // Can not just use "type", it conflicts with CATALOG_TYPE. - public static final String ICEBERG_CATALOG_TYPE = "catalog-type"; - public static final String ICEBERG_CATALOG_TYPE_HADOOP = "hadoop"; - public static final String ICEBERG_CATALOG_TYPE_HIVE = "hive"; - public static final String ICEBERG_CATALOG_TYPE_REST = "rest"; - - public static final String HIVE_CONF_DIR = "hive-conf-dir"; - public static final String HADOOP_CONF_DIR = "hadoop-conf-dir"; - public static final String DEFAULT_DATABASE = "default-database"; - public static final String DEFAULT_DATABASE_NAME = "default"; - public static final String DEFAULT_CATALOG_NAME = "default_catalog"; - public static final String BASE_NAMESPACE = "base-namespace"; - public static final String TYPE = "type"; - public static final String PROPERTY_VERSION = "property-version"; - - /** - * Create an Iceberg {@link org.apache.iceberg.catalog.Catalog} loader to be used by this Flink - * catalog adapter. - * - * @param name Flink's catalog name - * @param properties Flink's catalog properties - * @param hadoopConf Hadoop configuration for catalog - * @return an Iceberg catalog loader - */ - static CatalogLoader createCatalogLoader( - String name, Map properties, Configuration hadoopConf) { - String catalogImpl = properties.get(CatalogProperties.CATALOG_IMPL); - if (catalogImpl != null) { - String catalogType = properties.get(ICEBERG_CATALOG_TYPE); - Preconditions.checkArgument( - catalogType == null, - "Cannot create catalog %s, both catalog-type and catalog-impl are set: catalog-type=%s, catalog-impl=%s", - name, - catalogType, - catalogImpl); - return CatalogLoader.custom(name, properties, hadoopConf, catalogImpl); - } - - String catalogType = properties.getOrDefault(ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HIVE); - switch (catalogType.toLowerCase(Locale.ENGLISH)) { - case ICEBERG_CATALOG_TYPE_HIVE: - // The values of properties 'uri', 'warehouse', 'hive-conf-dir' are allowed to be null, in - // that case it will - // fallback to parse those values from hadoop configuration which is loaded from classpath. 
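// Illustrative sketch, not part of the original patch: registering this factory from Flink SQL
// with the options documented in the class Javadoc above. The metastore URI and warehouse
// locations are placeholders.
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

class CreateCatalogSketch {
  public static void main(String[] args) {
    TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());
    tEnv.executeSql(
        "CREATE CATALOG hive_iceberg WITH ("
            + " 'type'='iceberg',"
            + " 'catalog-type'='hive',"
            + " 'uri'='thrift://metastore:9083',"
            + " 'warehouse'='hdfs://namenode:8020/warehouse',"
            + " 'cache-enabled'='true')");
    tEnv.executeSql("USE CATALOG hive_iceberg");
  }
}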
- String hiveConfDir = properties.get(HIVE_CONF_DIR); - String hadoopConfDir = properties.get(HADOOP_CONF_DIR); - Configuration newHadoopConf = mergeHiveConf(hadoopConf, hiveConfDir, hadoopConfDir); - return CatalogLoader.hive(name, newHadoopConf, properties); - - case ICEBERG_CATALOG_TYPE_HADOOP: - return CatalogLoader.hadoop(name, hadoopConf, properties); - - case ICEBERG_CATALOG_TYPE_REST: - return CatalogLoader.rest(name, hadoopConf, properties); - - default: - throw new UnsupportedOperationException( - "Unknown catalog-type: " + catalogType + " (Must be 'hive', 'hadoop' or 'rest')"); - } - } - - @Override - public Map requiredContext() { - Map context = Maps.newHashMap(); - context.put(TYPE, "iceberg"); - context.put(PROPERTY_VERSION, "1"); - return context; - } - - @Override - public List supportedProperties() { - return ImmutableList.of("*"); - } - - @Override - public Catalog createCatalog(String name, Map properties) { - return createCatalog(name, properties, clusterHadoopConf()); - } - - protected Catalog createCatalog( - String name, Map properties, Configuration hadoopConf) { - CatalogLoader catalogLoader = createCatalogLoader(name, properties, hadoopConf); - String defaultDatabase = properties.getOrDefault(DEFAULT_DATABASE, DEFAULT_DATABASE_NAME); - - Namespace baseNamespace = Namespace.empty(); - if (properties.containsKey(BASE_NAMESPACE)) { - baseNamespace = Namespace.of(properties.get(BASE_NAMESPACE).split("\\.")); - } - - boolean cacheEnabled = - PropertyUtil.propertyAsBoolean( - properties, CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); - - long cacheExpirationIntervalMs = - PropertyUtil.propertyAsLong( - properties, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_OFF); - Preconditions.checkArgument( - cacheExpirationIntervalMs != 0, - "%s is not allowed to be 0.", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS); - - return new FlinkCatalog( - name, - defaultDatabase, - baseNamespace, - catalogLoader, - properties, - cacheEnabled, - cacheExpirationIntervalMs); - } - - private static Configuration mergeHiveConf( - Configuration hadoopConf, String hiveConfDir, String hadoopConfDir) { - Configuration newConf = new Configuration(hadoopConf); - if (!Strings.isNullOrEmpty(hiveConfDir)) { - Preconditions.checkState( - Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), - "There should be a hive-site.xml file under the directory %s", - hiveConfDir); - newConf.addResource(new Path(hiveConfDir, "hive-site.xml")); - } else { - // If don't provide the hive-site.xml path explicitly, it will try to load resource from - // classpath. If still - // couldn't load the configuration file, then it will throw exception in HiveCatalog. 
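// Illustrative sketch, not part of the original patch: supplying hive-conf-dir explicitly so the
// mergeHiveConf logic above loads hive-site.xml from that directory instead of falling back to
// the classpath. The directory path, catalog name and database name are placeholders.
import java.util.Map;
import org.apache.flink.table.catalog.Catalog;
import org.apache.iceberg.flink.FlinkCatalogFactory;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

class HiveConfDirSketch {
  public static void main(String[] args) {
    Map<String, String> props =
        ImmutableMap.of(
            "catalog-type", "hive",
            "hive-conf-dir", "/etc/hive/conf",
            "default-database", "analytics");
    Catalog catalog = new FlinkCatalogFactory().createCatalog("hive_iceberg", props);
    catalog.open();
  }
}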
- URL configFile = CatalogLoader.class.getClassLoader().getResource("hive-site.xml"); - if (configFile != null) { - newConf.addResource(configFile); - } - } - - if (!Strings.isNullOrEmpty(hadoopConfDir)) { - Preconditions.checkState( - Files.exists(Paths.get(hadoopConfDir, "hdfs-site.xml")), - "Failed to load Hadoop configuration: missing %s", - Paths.get(hadoopConfDir, "hdfs-site.xml")); - newConf.addResource(new Path(hadoopConfDir, "hdfs-site.xml")); - Preconditions.checkState( - Files.exists(Paths.get(hadoopConfDir, "core-site.xml")), - "Failed to load Hadoop configuration: missing %s", - Paths.get(hadoopConfDir, "core-site.xml")); - newConf.addResource(new Path(hadoopConfDir, "core-site.xml")); - } - - return newConf; - } - - public static Configuration clusterHadoopConf() { - return HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration()); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java deleted file mode 100644 index e0672811cf5f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.time.Duration; -import java.util.List; -import java.util.Map; -import java.util.function.Function; -import org.apache.flink.annotation.Internal; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.util.TimeUtils; -import org.apache.iceberg.Table; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -@Internal -public class FlinkConfParser { - - private final Map tableProperties; - private final Map options; - private final ReadableConfig readableConfig; - - public FlinkConfParser(Table table, Map options, ReadableConfig readableConfig) { - this.tableProperties = table.properties(); - this.options = options; - this.readableConfig = readableConfig; - } - - FlinkConfParser(Map options, ReadableConfig readableConfig) { - this.tableProperties = ImmutableMap.of(); - this.options = options; - this.readableConfig = readableConfig; - } - - public BooleanConfParser booleanConf() { - return new BooleanConfParser(); - } - - public IntConfParser intConf() { - return new IntConfParser(); - } - - public LongConfParser longConf() { - return new LongConfParser(); - } - - public DoubleConfParser doubleConf() { - return new DoubleConfParser(); - } - - public > EnumConfParser enumConfParser(Class enumClass) { - return new EnumConfParser<>(enumClass); - } - - public StringConfParser stringConf() { - return new StringConfParser(); - } - - public DurationConfParser durationConf() { - return new DurationConfParser(); - } - - public class BooleanConfParser extends ConfParser { - private Boolean defaultValue; - - @Override - protected BooleanConfParser self() { - return this; - } - - public BooleanConfParser defaultValue(boolean value) { - this.defaultValue = value; - return self(); - } - - public BooleanConfParser defaultValue(String value) { - this.defaultValue = Boolean.parseBoolean(value); - return self(); - } - - public boolean parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Boolean::parseBoolean, defaultValue); - } - } - - public class IntConfParser extends ConfParser { - private Integer defaultValue; - - @Override - protected IntConfParser self() { - return this; - } - - public IntConfParser defaultValue(int value) { - this.defaultValue = value; - return self(); - } - - public int parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Integer::parseInt, defaultValue); - } - - public Integer parseOptional() { - return parse(Integer::parseInt, null); - } - } - - public class LongConfParser extends ConfParser { - private Long defaultValue; - - @Override - protected LongConfParser self() { - return this; - } - - public LongConfParser defaultValue(long value) { - this.defaultValue = value; - return self(); - } - - public long parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Long::parseLong, defaultValue); - } - - public Long parseOptional() { - return parse(Long::parseLong, null); - } - } - - public class DoubleConfParser extends ConfParser { - private Double defaultValue; - - @Override - protected DoubleConfParser self() { - return this; - } - - public DoubleConfParser defaultValue(double value) { - this.defaultValue = value; - return 
self(); - } - - public double parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Double::parseDouble, defaultValue); - } - - public Double parseOptional() { - return parse(Double::parseDouble, null); - } - } - - public class StringConfParser extends ConfParser { - private String defaultValue; - - @Override - protected StringConfParser self() { - return this; - } - - public StringConfParser defaultValue(String value) { - this.defaultValue = value; - return self(); - } - - public String parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Function.identity(), defaultValue); - } - - public String parseOptional() { - return parse(Function.identity(), null); - } - } - - public class EnumConfParser> extends ConfParser, E> { - private E defaultValue; - private final Class enumClass; - - EnumConfParser(Class enumClass) { - this.enumClass = enumClass; - } - - @Override - protected EnumConfParser self() { - return this; - } - - public EnumConfParser defaultValue(E value) { - this.defaultValue = value; - return self(); - } - - public E parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(s -> Enum.valueOf(enumClass, s), defaultValue); - } - - public E parseOptional() { - return parse(s -> Enum.valueOf(enumClass, s), null); - } - } - - public class DurationConfParser extends ConfParser { - private Duration defaultValue; - - @Override - protected DurationConfParser self() { - return this; - } - - public DurationConfParser defaultValue(Duration value) { - this.defaultValue = value; - return self(); - } - - public Duration parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(TimeUtils::parseDuration, defaultValue); - } - - public Duration parseOptional() { - return parse(TimeUtils::parseDuration, null); - } - } - - public abstract class ConfParser { - private final List optionNames = Lists.newArrayList(); - private String tablePropertyName; - private ConfigOption configOption; - - protected abstract ThisT self(); - - public ThisT option(String name) { - this.optionNames.add(name); - return self(); - } - - public ThisT flinkConfig(ConfigOption newConfigOption) { - this.configOption = newConfigOption; - return self(); - } - - public ThisT tableProperty(String name) { - this.tablePropertyName = name; - return self(); - } - - protected T parse(Function conversion, T defaultValue) { - if (!optionNames.isEmpty()) { - for (String optionName : optionNames) { - String optionValue = options.get(optionName); - if (optionValue != null) { - return conversion.apply(optionValue); - } - } - } - - if (configOption != null) { - T propertyValue = readableConfig.get(configOption); - if (propertyValue != null) { - return propertyValue; - } - } - - if (tablePropertyName != null) { - String propertyValue = tableProperties.get(tablePropertyName); - if (propertyValue != null) { - return conversion.apply(propertyValue); - } - } - - return defaultValue; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java deleted file mode 100644 index 6362bc447634..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor 
license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.description.Description; -import org.apache.flink.configuration.description.TextElement; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.iceberg.flink.source.assigner.SplitAssignerType; -import org.apache.iceberg.util.ThreadPools; - -/** - * When constructing Flink Iceberg source via Java API, configs can be set in {@link Configuration} - * passed to source builder. E.g. - * - *

- * <pre>
- *   configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true);
- *   FlinkSource.forRowData()
- *       .flinkConf(configuration)
- *       ...
- * </pre>
- *
- * <p>When using Flink SQL/table API, connector options can be set in Flink's {@link
- * TableEnvironment}.
- *
- * <pre>
- *   TableEnvironment tEnv = createTableEnv();
- *   tEnv.getConfig()
- *        .getConfiguration()
- *        .setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true);
- * </pre>
    - */ -public class FlinkConfigOptions { - - private FlinkConfigOptions() {} - - public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM = - ConfigOptions.key("table.exec.iceberg.infer-source-parallelism") - .booleanType() - .defaultValue(true) - .withDescription( - "If is false, parallelism of source are set by config.\n" - + "If is true, source parallelism is inferred according to splits number.\n"); - - public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX = - ConfigOptions.key("table.exec.iceberg.infer-source-parallelism.max") - .intType() - .defaultValue(100) - .withDescription("Sets max infer parallelism for source operator."); - - public static final ConfigOption TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO = - ConfigOptions.key("table.exec.iceberg.expose-split-locality-info") - .booleanType() - .noDefaultValue() - .withDescription( - "Expose split host information to use Flink's locality aware split assigner."); - - public static final ConfigOption SOURCE_READER_FETCH_BATCH_RECORD_COUNT = - ConfigOptions.key("table.exec.iceberg.fetch-batch-record-count") - .intType() - .defaultValue(2048) - .withDescription("The target number of records for Iceberg reader fetch batch."); - - public static final ConfigOption TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE = - ConfigOptions.key("table.exec.iceberg.worker-pool-size") - .intType() - .defaultValue(ThreadPools.WORKER_THREAD_POOL_SIZE) - .withDescription("The size of workers pool used to plan or scan manifests."); - - public static final ConfigOption TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE = - ConfigOptions.key("table.exec.iceberg.use-flip27-source") - .booleanType() - .defaultValue(false) - .withDescription("Use the FLIP-27 based Iceberg source implementation."); - - public static final ConfigOption TABLE_EXEC_ICEBERG_USE_V2_SINK = - ConfigOptions.key("table.exec.iceberg.use-v2-sink") - .booleanType() - .defaultValue(false) - .withDescription("Use the SinkV2 API based Iceberg sink implementation."); - - public static final ConfigOption TABLE_EXEC_SPLIT_ASSIGNER_TYPE = - ConfigOptions.key("table.exec.iceberg.split-assigner-type") - .enumType(SplitAssignerType.class) - .defaultValue(SplitAssignerType.SIMPLE) - .withDescription( - Description.builder() - .text("Split assigner type that determine how splits are assigned to readers.") - .linebreak() - .list( - TextElement.text( - SplitAssignerType.SIMPLE - + ": simple assigner that doesn't provide any guarantee on order or locality.")) - .build()); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java deleted file mode 100644 index ab69ec5adc7f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCreateTableOptions.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.Map; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.iceberg.util.JsonUtil; - -class FlinkCreateTableOptions { - private final String catalogName; - private final String catalogDb; - private final String catalogTable; - private final Map catalogProps; - - private FlinkCreateTableOptions( - String catalogName, String catalogDb, String catalogTable, Map props) { - this.catalogName = catalogName; - this.catalogDb = catalogDb; - this.catalogTable = catalogTable; - this.catalogProps = props; - } - - public static final ConfigOption CATALOG_NAME = - ConfigOptions.key("catalog-name") - .stringType() - .noDefaultValue() - .withDescription("Catalog name"); - - public static final ConfigOption CATALOG_TYPE = - ConfigOptions.key(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE) - .stringType() - .noDefaultValue() - .withDescription("Catalog type, the optional types are: custom, hadoop, hive."); - - public static final ConfigOption CATALOG_DATABASE = - ConfigOptions.key("catalog-database") - .stringType() - .defaultValue(FlinkCatalogFactory.DEFAULT_DATABASE_NAME) - .withDescription("Database name managed in the iceberg catalog."); - - public static final ConfigOption CATALOG_TABLE = - ConfigOptions.key("catalog-table") - .stringType() - .noDefaultValue() - .withDescription("Table name managed in the underlying iceberg catalog and database."); - - public static final ConfigOption> CATALOG_PROPS = - ConfigOptions.key("catalog-props") - .mapType() - .noDefaultValue() - .withDescription("Properties for the underlying catalog for iceberg table."); - - public static final String SRC_CATALOG_PROPS_KEY = "src-catalog"; - public static final String CONNECTOR_PROPS_KEY = "connector"; - public static final String LOCATION_KEY = "location"; - - static String toJson( - String catalogName, String catalogDb, String catalogTable, Map catalogProps) { - return JsonUtil.generate( - gen -> { - gen.writeStartObject(); - gen.writeStringField(CATALOG_NAME.key(), catalogName); - gen.writeStringField(CATALOG_DATABASE.key(), catalogDb); - gen.writeStringField(CATALOG_TABLE.key(), catalogTable); - JsonUtil.writeStringMap(CATALOG_PROPS.key(), catalogProps, gen); - gen.writeEndObject(); - }, - false); - } - - static FlinkCreateTableOptions fromJson(String createTableOptions) { - return JsonUtil.parse( - createTableOptions, - node -> { - String catalogName = JsonUtil.getString(CATALOG_NAME.key(), node); - String catalogDb = JsonUtil.getString(CATALOG_DATABASE.key(), node); - String catalogTable = JsonUtil.getString(CATALOG_TABLE.key(), node); - Map catalogProps = JsonUtil.getStringMap(CATALOG_PROPS.key(), node); - - return new FlinkCreateTableOptions(catalogName, catalogDb, catalogTable, catalogProps); - }); - } - - String catalogName() { - return catalogName; - } - - String catalogDb() { - return catalogDb; - } - - String catalogTable() { - return catalogTable; - } - - Map catalogProps() { - return catalogProps; - } -} diff --git 
a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java deleted file mode 100644 index 3b9cc0baff11..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.Collections; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.catalog.CatalogDatabaseImpl; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ObjectIdentifier; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.ResolvedCatalogTable; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; -import org.apache.flink.table.connector.sink.DynamicTableSink; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.factories.DynamicTableSinkFactory; -import org.apache.flink.table.factories.DynamicTableSourceFactory; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.AlreadyExistsException; -import org.apache.iceberg.flink.source.IcebergTableSource; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; - -public class FlinkDynamicTableFactory - implements DynamicTableSinkFactory, DynamicTableSourceFactory { - static final String FACTORY_IDENTIFIER = "iceberg"; - private final FlinkCatalog catalog; - - public FlinkDynamicTableFactory() { - this.catalog = null; - } - - public FlinkDynamicTableFactory(FlinkCatalog catalog) { - this.catalog = catalog; - } - - @Override - public DynamicTableSource createDynamicTableSource(Context context) { - ObjectIdentifier objectIdentifier = context.getObjectIdentifier(); - ResolvedCatalogTable resolvedCatalogTable = context.getCatalogTable(); - Map tableProps = resolvedCatalogTable.getOptions(); - ResolvedSchema resolvedSchema = - ResolvedSchema.of( - resolvedCatalogTable.getResolvedSchema().getColumns().stream() - .filter(Column::isPhysical) - .collect(Collectors.toList())); - - TableLoader tableLoader; - if (catalog != null) { - tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath()); - } else { - 
tableLoader = - createTableLoader( - resolvedCatalogTable, - tableProps, - objectIdentifier.getDatabaseName(), - objectIdentifier.getObjectName()); - } - - return new IcebergTableSource( - tableLoader, resolvedSchema, tableProps, context.getConfiguration()); - } - - @Override - public DynamicTableSink createDynamicTableSink(Context context) { - ObjectIdentifier objectIdentifier = context.getObjectIdentifier(); - ResolvedCatalogTable resolvedCatalogTable = context.getCatalogTable(); - Map writeProps = resolvedCatalogTable.getOptions(); - ResolvedSchema resolvedSchema = - ResolvedSchema.of( - resolvedCatalogTable.getResolvedSchema().getColumns().stream() - .filter(Column::isPhysical) - .collect(Collectors.toList())); - - TableLoader tableLoader; - if (catalog != null) { - tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath()); - } else { - tableLoader = - createTableLoader( - resolvedCatalogTable, - writeProps, - objectIdentifier.getDatabaseName(), - objectIdentifier.getObjectName()); - } - - return new IcebergTableSink( - tableLoader, resolvedSchema, context.getConfiguration(), writeProps); - } - - @Override - public Set> requiredOptions() { - Set> options = Sets.newHashSet(); - options.add(FlinkCreateTableOptions.CATALOG_TYPE); - options.add(FlinkCreateTableOptions.CATALOG_NAME); - return options; - } - - @Override - public Set> optionalOptions() { - Set> options = Sets.newHashSet(); - options.add(FlinkCreateTableOptions.CATALOG_DATABASE); - options.add(FlinkCreateTableOptions.CATALOG_TABLE); - return options; - } - - @Override - public String factoryIdentifier() { - return FACTORY_IDENTIFIER; - } - - private static TableLoader createTableLoader( - ResolvedCatalogTable resolvedCatalogTable, - Map tableProps, - String databaseName, - String tableName) { - Configuration flinkConf = new Configuration(); - - Map mergedProps = mergeSrcCatalogProps(tableProps); - - mergedProps.forEach(flinkConf::setString); - - String catalogName = flinkConf.getString(FlinkCreateTableOptions.CATALOG_NAME); - Preconditions.checkNotNull( - catalogName, - "Table property '%s' cannot be null", - FlinkCreateTableOptions.CATALOG_NAME.key()); - - String catalogDatabase = - flinkConf.getString(FlinkCreateTableOptions.CATALOG_DATABASE, databaseName); - Preconditions.checkNotNull(catalogDatabase, "The iceberg database name cannot be null"); - - String catalogTable = flinkConf.getString(FlinkCreateTableOptions.CATALOG_TABLE, tableName); - Preconditions.checkNotNull(catalogTable, "The iceberg table name cannot be null"); - - org.apache.hadoop.conf.Configuration hadoopConf = FlinkCatalogFactory.clusterHadoopConf(); - FlinkCatalogFactory factory = new FlinkCatalogFactory(); - FlinkCatalog flinkCatalog = - (FlinkCatalog) factory.createCatalog(catalogName, mergedProps, hadoopConf); - ObjectPath objectPath = new ObjectPath(catalogDatabase, catalogTable); - - // Create database if not exists in the external catalog. - if (!flinkCatalog.databaseExists(catalogDatabase)) { - try { - flinkCatalog.createDatabase( - catalogDatabase, new CatalogDatabaseImpl(Maps.newHashMap(), null), true); - } catch (DatabaseAlreadyExistException e) { - throw new AlreadyExistsException( - e, - "Database %s already exists in the iceberg catalog %s.", - catalogName, - catalogDatabase); - } - } - - // Create table if not exists in the external catalog. 
- if (!flinkCatalog.tableExists(objectPath)) { - try { - flinkCatalog.createIcebergTable(objectPath, resolvedCatalogTable, true); - } catch (TableAlreadyExistException e) { - throw new AlreadyExistsException( - e, - "Table %s already exists in the database %s and catalog %s", - catalogTable, - catalogDatabase, - catalogName); - } - } - - return TableLoader.fromCatalog( - flinkCatalog.getCatalogLoader(), TableIdentifier.of(catalogDatabase, catalogTable)); - } - - /** - * Merges source catalog properties with connector properties. Iceberg Catalog properties are - * serialized as json in FlinkCatalog#getTable to be able to isolate catalog props from iceberg - * table props, Here, we flatten and merge them back to use to create catalog. - * - * @param tableProps the existing table properties - * @return a map of merged properties, with source catalog properties taking precedence when keys - * conflict - */ - private static Map mergeSrcCatalogProps(Map tableProps) { - String srcCatalogProps = tableProps.get(FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY); - if (srcCatalogProps != null) { - Map mergedProps = Maps.newHashMap(); - FlinkCreateTableOptions createTableOptions = - FlinkCreateTableOptions.fromJson(srcCatalogProps); - - mergedProps.put(FlinkCreateTableOptions.CATALOG_NAME.key(), createTableOptions.catalogName()); - mergedProps.put( - FlinkCreateTableOptions.CATALOG_DATABASE.key(), createTableOptions.catalogDb()); - mergedProps.put( - FlinkCreateTableOptions.CATALOG_TABLE.key(), createTableOptions.catalogTable()); - mergedProps.putAll(createTableOptions.catalogProps()); - - tableProps.forEach( - (k, v) -> { - if (!FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY.equals(k)) { - mergedProps.put(k, v); - } - }); - - return Collections.unmodifiableMap(mergedProps); - } - - return tableProps; - } - - private static TableLoader createTableLoader(FlinkCatalog catalog, ObjectPath objectPath) { - Preconditions.checkNotNull(catalog, "Flink catalog cannot be null"); - return TableLoader.fromCatalog(catalog.getCatalogLoader(), catalog.toIdentifier(objectPath)); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java deleted file mode 100644 index f35bb577fbba..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
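// [Illustrative aside, not part of the original patch] The 'src-catalog' option consumed by
// mergeSrcCatalogProps above carries the JSON written by FlinkCreateTableOptions.toJson; a sketch
// of its shape, with made-up values:
//   {"catalog-name":"hive_prod","catalog-database":"db","catalog-table":"events",
//    "catalog-props":{"type":"hive","uri":"thrift://metastore:9083"}}
// The keys come from the CATALOG_NAME, CATALOG_DATABASE, CATALOG_TABLE and CATALOG_PROPS options defined earlier.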
- */ -package org.apache.iceberg.flink; - -import org.apache.iceberg.EnvironmentContext; -import org.apache.iceberg.flink.util.FlinkPackage; - -class FlinkEnvironmentContext { - private FlinkEnvironmentContext() {} - - public static void init() { - EnvironmentContext.put(EnvironmentContext.ENGINE_NAME, "flink"); - EnvironmentContext.put(EnvironmentContext.ENGINE_VERSION, FlinkPackage.version()); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java deleted file mode 100644 index f2244d5137a1..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.function.BiFunction; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.apache.flink.table.expressions.CallExpression; -import org.apache.flink.table.expressions.FieldReferenceExpression; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.expressions.ValueLiteralExpression; -import org.apache.flink.table.functions.BuiltInFunctionDefinitions; -import org.apache.flink.table.functions.FunctionDefinition; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expression.Operation; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.NaNUtil; - -public class FlinkFilters { - private FlinkFilters() {} - - private static final Pattern STARTS_WITH_PATTERN = Pattern.compile("([^%]+)%"); - - private static final Map FILTERS = - ImmutableMap.builder() - .put(BuiltInFunctionDefinitions.EQUALS, Operation.EQ) - .put(BuiltInFunctionDefinitions.NOT_EQUALS, Operation.NOT_EQ) - .put(BuiltInFunctionDefinitions.GREATER_THAN, Operation.GT) - .put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, Operation.GT_EQ) - .put(BuiltInFunctionDefinitions.LESS_THAN, Operation.LT) - .put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, Operation.LT_EQ) - .put(BuiltInFunctionDefinitions.IS_NULL, Operation.IS_NULL) - .put(BuiltInFunctionDefinitions.IS_NOT_NULL, Operation.NOT_NULL) - .put(BuiltInFunctionDefinitions.AND, Operation.AND) - .put(BuiltInFunctionDefinitions.OR, Operation.OR) - .put(BuiltInFunctionDefinitions.NOT, Operation.NOT) - .put(BuiltInFunctionDefinitions.LIKE, 
Operation.STARTS_WITH) - .buildOrThrow(); - - /** - * Convert flink expression to iceberg expression. - * - * <p>
    the BETWEEN, NOT_BETWEEN, IN expression will be converted by flink automatically. the - * BETWEEN will be converted to (GT_EQ AND LT_EQ), the NOT_BETWEEN will be converted to (LT_EQ OR - * GT_EQ), the IN will be converted to OR, so we do not add the conversion here - * - * @param flinkExpression the flink expression - * @return the iceberg expression - */ - public static Optional convert( - org.apache.flink.table.expressions.Expression flinkExpression) { - if (!(flinkExpression instanceof CallExpression)) { - return Optional.empty(); - } - - CallExpression call = (CallExpression) flinkExpression; - Operation op = FILTERS.get(call.getFunctionDefinition()); - if (op != null) { - switch (op) { - case IS_NULL: - return onlyChildAs(call, FieldReferenceExpression.class) - .map(FieldReferenceExpression::getName) - .map(Expressions::isNull); - - case NOT_NULL: - return onlyChildAs(call, FieldReferenceExpression.class) - .map(FieldReferenceExpression::getName) - .map(Expressions::notNull); - - case LT: - return convertFieldAndLiteral(Expressions::lessThan, Expressions::greaterThan, call); - - case LT_EQ: - return convertFieldAndLiteral( - Expressions::lessThanOrEqual, Expressions::greaterThanOrEqual, call); - - case GT: - return convertFieldAndLiteral(Expressions::greaterThan, Expressions::lessThan, call); - - case GT_EQ: - return convertFieldAndLiteral( - Expressions::greaterThanOrEqual, Expressions::lessThanOrEqual, call); - - case EQ: - return convertFieldAndLiteral( - (ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.isNaN(ref); - } else { - return Expressions.equal(ref, lit); - } - }, - call); - - case NOT_EQ: - return convertFieldAndLiteral( - (ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.notNaN(ref); - } else { - return Expressions.notEqual(ref, lit); - } - }, - call); - - case NOT: - return onlyChildAs(call, CallExpression.class) - .flatMap(FlinkFilters::convert) - .map(Expressions::not); - - case AND: - return convertLogicExpression(Expressions::and, call); - - case OR: - return convertLogicExpression(Expressions::or, call); - - case STARTS_WITH: - return convertLike(call); - } - } - - return Optional.empty(); - } - - private static Optional onlyChildAs( - CallExpression call, Class expectedChildClass) { - List children = call.getResolvedChildren(); - if (children.size() != 1) { - return Optional.empty(); - } - - ResolvedExpression child = children.get(0); - if (!expectedChildClass.isInstance(child)) { - return Optional.empty(); - } - - return Optional.of(expectedChildClass.cast(child)); - } - - private static Optional convertLike(CallExpression call) { - List args = call.getResolvedChildren(); - if (args.size() != 2) { - return Optional.empty(); - } - - org.apache.flink.table.expressions.Expression left = args.get(0); - org.apache.flink.table.expressions.Expression right = args.get(1); - - if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { - String name = ((FieldReferenceExpression) left).getName(); - return convertLiteral((ValueLiteralExpression) right) - .flatMap( - lit -> { - if (lit instanceof String) { - String pattern = (String) lit; - Matcher matcher = STARTS_WITH_PATTERN.matcher(pattern); - // exclude special char of LIKE - // '_' is the wildcard of the SQL LIKE - if (!pattern.contains("_") && matcher.matches()) { - return Optional.of(Expressions.startsWith(name, matcher.group(1))); - } - } - - return Optional.empty(); - }); - } - - return Optional.empty(); - } - - private static Optional 
convertLogicExpression( - BiFunction function, CallExpression call) { - List args = call.getResolvedChildren(); - if (args == null || args.size() != 2) { - return Optional.empty(); - } - - Optional left = convert(args.get(0)); - Optional right = convert(args.get(1)); - if (left.isPresent() && right.isPresent()) { - return Optional.of(function.apply(left.get(), right.get())); - } - - return Optional.empty(); - } - - private static Optional convertLiteral(ValueLiteralExpression expression) { - Optional value = - expression.getValueAs( - expression.getOutputDataType().getLogicalType().getDefaultConversion()); - return value.map( - o -> { - if (o instanceof LocalDateTime) { - return DateTimeUtil.microsFromTimestamp((LocalDateTime) o); - } else if (o instanceof Instant) { - return DateTimeUtil.microsFromInstant((Instant) o); - } else if (o instanceof LocalTime) { - return DateTimeUtil.microsFromTime((LocalTime) o); - } else if (o instanceof LocalDate) { - return DateTimeUtil.daysFromDate((LocalDate) o); - } - - return o; - }); - } - - private static Optional convertFieldAndLiteral( - BiFunction expr, CallExpression call) { - return convertFieldAndLiteral(expr, expr, call); - } - - private static Optional convertFieldAndLiteral( - BiFunction convertLR, - BiFunction convertRL, - CallExpression call) { - List args = call.getResolvedChildren(); - if (args.size() != 2) { - return Optional.empty(); - } - - org.apache.flink.table.expressions.Expression left = args.get(0); - org.apache.flink.table.expressions.Expression right = args.get(1); - - if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { - String name = ((FieldReferenceExpression) left).getName(); - Optional lit = convertLiteral((ValueLiteralExpression) right); - if (lit.isPresent()) { - return Optional.of(convertLR.apply(name, lit.get())); - } - } else if (left instanceof ValueLiteralExpression - && right instanceof FieldReferenceExpression) { - Optional lit = convertLiteral((ValueLiteralExpression) left); - String name = ((FieldReferenceExpression) right).getName(); - if (lit.isPresent()) { - return Optional.of(convertRL.apply(name, lit.get())); - } - } - - return Optional.empty(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java deleted file mode 100644 index 767d4497ac91..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
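Illustrative aside (not part of the original patch): the LIKE handling in FlinkFilters above only pushes down prefix patterns that contain no '_' wildcard. A minimal sketch of the equivalent Iceberg expression, using a made-up column name:

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;

class LikePushdownSketch {
  // What FlinkFilters.convert(...) yields for `data LIKE 'abc%'`.
  static Expression prefixOnly() {
    return Expressions.startsWith("data", "abc");
  }
  // Patterns such as '%abc' or 'a_c%' are not convertible; convert(...) returns Optional.empty().
}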
- */ -package org.apache.iceberg.flink; - -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.FixupTypes; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -/** - * The uuid and fixed are converted to the same Flink type. Conversion back can produce only one, - * which may not be correct. - */ -class FlinkFixupTypes extends FixupTypes { - - private FlinkFixupTypes(Schema referenceSchema) { - super(referenceSchema); - } - - static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema( - TypeUtil.visit(schema, new FlinkFixupTypes(referenceSchema)).asStructType().fields()); - } - - @Override - protected boolean fixupPrimitive(Type.PrimitiveType type, Type source) { - if (type instanceof Types.FixedType) { - int length = ((Types.FixedType) type).length(); - return source.typeId() == Type.TypeID.UUID && length == 16; - } - return false; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java deleted file mode 100644 index 804a956ec9b9..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.time.Duration; -import java.util.Map; -import java.util.concurrent.TimeUnit; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.util.TimeUtils; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; - -public class FlinkReadConf { - - private final FlinkConfParser confParser; - - public FlinkReadConf( - Table table, Map readOptions, ReadableConfig readableConfig) { - this.confParser = new FlinkConfParser(table, readOptions, readableConfig); - } - - public Long snapshotId() { - return confParser.longConf().option(FlinkReadOptions.SNAPSHOT_ID.key()).parseOptional(); - } - - public String tag() { - return confParser.stringConf().option(FlinkReadOptions.TAG.key()).parseOptional(); - } - - public String startTag() { - return confParser.stringConf().option(FlinkReadOptions.START_TAG.key()).parseOptional(); - } - - public String endTag() { - return confParser.stringConf().option(FlinkReadOptions.END_TAG.key()).parseOptional(); - } - - public String branch() { - return confParser.stringConf().option(FlinkReadOptions.BRANCH.key()).parseOptional(); - } - - public boolean caseSensitive() { - return confParser - .booleanConf() - .option(FlinkReadOptions.CASE_SENSITIVE) - .flinkConfig(FlinkReadOptions.CASE_SENSITIVE_OPTION) - .defaultValue(FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue()) - .parse(); - } - - public Long asOfTimestamp() { - return confParser.longConf().option(FlinkReadOptions.AS_OF_TIMESTAMP.key()).parseOptional(); - } - - public StreamingStartingStrategy startingStrategy() { - return confParser - .enumConfParser(StreamingStartingStrategy.class) - .option(FlinkReadOptions.STARTING_STRATEGY) - .flinkConfig(FlinkReadOptions.STARTING_STRATEGY_OPTION) - .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .parse(); - } - - public Long startSnapshotTimestamp() { - return confParser - .longConf() - .option(FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.key()) - .parseOptional(); - } - - public Long startSnapshotId() { - return confParser.longConf().option(FlinkReadOptions.START_SNAPSHOT_ID.key()).parseOptional(); - } - - public Long endSnapshotId() { - return confParser.longConf().option(FlinkReadOptions.END_SNAPSHOT_ID.key()).parseOptional(); - } - - public long splitSize() { - return confParser - .longConf() - .option(FlinkReadOptions.SPLIT_SIZE) - .flinkConfig(FlinkReadOptions.SPLIT_SIZE_OPTION) - .tableProperty(TableProperties.SPLIT_SIZE) - .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT) - .parse(); - } - - public int splitLookback() { - return confParser - .intConf() - .option(FlinkReadOptions.SPLIT_LOOKBACK) - .flinkConfig(FlinkReadOptions.SPLIT_LOOKBACK_OPTION) - .tableProperty(TableProperties.SPLIT_LOOKBACK) - .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT) - .parse(); - } - - public long splitFileOpenCost() { - return confParser - .longConf() - .option(FlinkReadOptions.SPLIT_FILE_OPEN_COST) - .flinkConfig(FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION) - .tableProperty(TableProperties.SPLIT_OPEN_FILE_COST) - .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT) - .parse(); - } - - public boolean streaming() { - return confParser - .booleanConf() - .option(FlinkReadOptions.STREAMING) - .flinkConfig(FlinkReadOptions.STREAMING_OPTION) - .defaultValue(FlinkReadOptions.STREAMING_OPTION.defaultValue()) - .parse(); - } - - public Duration monitorInterval() { - String duration = - 
confParser - .stringConf() - .option(FlinkReadOptions.MONITOR_INTERVAL) - .flinkConfig(FlinkReadOptions.MONITOR_INTERVAL_OPTION) - .defaultValue(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue()) - .parse(); - - return TimeUtils.parseDuration(duration); - } - - public boolean includeColumnStats() { - return confParser - .booleanConf() - .option(FlinkReadOptions.INCLUDE_COLUMN_STATS) - .flinkConfig(FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION) - .defaultValue(FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue()) - .parse(); - } - - public int maxPlanningSnapshotCount() { - return confParser - .intConf() - .option(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT) - .flinkConfig(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION) - .defaultValue(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION.defaultValue()) - .parse(); - } - - public String nameMapping() { - return confParser.stringConf().option(TableProperties.DEFAULT_NAME_MAPPING).parseOptional(); - } - - public long limit() { - return confParser - .longConf() - .option(FlinkReadOptions.LIMIT) - .flinkConfig(FlinkReadOptions.LIMIT_OPTION) - .defaultValue(FlinkReadOptions.LIMIT_OPTION.defaultValue()) - .parse(); - } - - public int workerPoolSize() { - return confParser - .intConf() - .option(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.key()) - .flinkConfig(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE) - .defaultValue(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue()) - .parse(); - } - - public int maxAllowedPlanningFailures() { - return confParser - .intConf() - .option(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES) - .flinkConfig(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION) - .defaultValue(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue()) - .parse(); - } - - public String watermarkColumn() { - return confParser - .stringConf() - .option(FlinkReadOptions.WATERMARK_COLUMN) - .flinkConfig(FlinkReadOptions.WATERMARK_COLUMN_OPTION) - .defaultValue(FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue()) - .parseOptional(); - } - - public TimeUnit watermarkColumnTimeUnit() { - return confParser - .enumConfParser(TimeUnit.class) - .option(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT) - .flinkConfig(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION) - .defaultValue(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue()) - .parse(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java deleted file mode 100644 index 1bbd88146c8f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
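Illustrative aside (not part of the original patch): a hedged sketch of how FlinkReadConf above resolves per-job read options; `table` is assumed to exist and the option values are made up:

import java.time.Duration;
import java.util.Map;
import org.apache.flink.configuration.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.FlinkReadConf;

class ReadConfSketch {
  static void show(Table table) {
    Map<String, String> readOptions = Map.of("streaming", "true", "monitor-interval", "30s");
    FlinkReadConf readConf = new FlinkReadConf(table, readOptions, new Configuration());
    boolean streaming = readConf.streaming();        // true
    Duration interval = readConf.monitorInterval();  // 30 seconds
  }
}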
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.concurrent.TimeUnit; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; - -/** Flink source read options */ -public class FlinkReadOptions { - private static final String PREFIX = "connector.iceberg."; - - private FlinkReadOptions() {} - - public static final ConfigOption SNAPSHOT_ID = - ConfigOptions.key("snapshot-id").longType().defaultValue(null); - - public static final ConfigOption TAG = - ConfigOptions.key("tag").stringType().defaultValue(null); - - public static final ConfigOption BRANCH = - ConfigOptions.key("branch").stringType().defaultValue(null); - - public static final ConfigOption START_TAG = - ConfigOptions.key("start-tag").stringType().defaultValue(null); - - public static final ConfigOption END_TAG = - ConfigOptions.key("end-tag").stringType().defaultValue(null); - - public static final String CASE_SENSITIVE = "case-sensitive"; - public static final ConfigOption CASE_SENSITIVE_OPTION = - ConfigOptions.key(PREFIX + CASE_SENSITIVE).booleanType().defaultValue(false); - - public static final ConfigOption AS_OF_TIMESTAMP = - ConfigOptions.key("as-of-timestamp").longType().defaultValue(null); - - public static final String STARTING_STRATEGY = "starting-strategy"; - public static final ConfigOption STARTING_STRATEGY_OPTION = - ConfigOptions.key(PREFIX + STARTING_STRATEGY) - .enumType(StreamingStartingStrategy.class) - .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT); - - public static final ConfigOption START_SNAPSHOT_TIMESTAMP = - ConfigOptions.key("start-snapshot-timestamp").longType().defaultValue(null); - - public static final ConfigOption START_SNAPSHOT_ID = - ConfigOptions.key("start-snapshot-id").longType().defaultValue(null); - - public static final ConfigOption END_SNAPSHOT_ID = - ConfigOptions.key("end-snapshot-id").longType().defaultValue(null); - - public static final String SPLIT_SIZE = "split-size"; - public static final ConfigOption SPLIT_SIZE_OPTION = - ConfigOptions.key(PREFIX + SPLIT_SIZE) - .longType() - .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT); - - public static final String SPLIT_LOOKBACK = "split-lookback"; - public static final ConfigOption SPLIT_LOOKBACK_OPTION = - ConfigOptions.key(PREFIX + SPLIT_LOOKBACK) - .intType() - .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT); - - public static final String SPLIT_FILE_OPEN_COST = "split-file-open-cost"; - public static final ConfigOption SPLIT_FILE_OPEN_COST_OPTION = - ConfigOptions.key(PREFIX + SPLIT_FILE_OPEN_COST) - .longType() - .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); - - public static final String STREAMING = "streaming"; - public static final ConfigOption STREAMING_OPTION = - ConfigOptions.key(PREFIX + STREAMING).booleanType().defaultValue(false); - - public static final String MONITOR_INTERVAL = "monitor-interval"; - public static final ConfigOption MONITOR_INTERVAL_OPTION = - ConfigOptions.key(PREFIX + MONITOR_INTERVAL).stringType().defaultValue("60s"); - - public static final String INCLUDE_COLUMN_STATS = "include-column-stats"; - public static final ConfigOption INCLUDE_COLUMN_STATS_OPTION = - ConfigOptions.key(PREFIX + INCLUDE_COLUMN_STATS).booleanType().defaultValue(false); - - public static final String 
MAX_PLANNING_SNAPSHOT_COUNT = "max-planning-snapshot-count"; - public static final ConfigOption MAX_PLANNING_SNAPSHOT_COUNT_OPTION = - ConfigOptions.key(PREFIX + MAX_PLANNING_SNAPSHOT_COUNT) - .intType() - .defaultValue(Integer.MAX_VALUE); - - public static final String LIMIT = "limit"; - public static final ConfigOption LIMIT_OPTION = - ConfigOptions.key(PREFIX + LIMIT).longType().defaultValue(-1L); - - public static final String MAX_ALLOWED_PLANNING_FAILURES = "max-allowed-planning-failures"; - public static final ConfigOption MAX_ALLOWED_PLANNING_FAILURES_OPTION = - ConfigOptions.key(PREFIX + MAX_ALLOWED_PLANNING_FAILURES).intType().defaultValue(3); - - public static final String WATERMARK_COLUMN = "watermark-column"; - public static final ConfigOption WATERMARK_COLUMN_OPTION = - ConfigOptions.key(PREFIX + WATERMARK_COLUMN).stringType().noDefaultValue(); - - public static final String WATERMARK_COLUMN_TIME_UNIT = "watermark-column-time-unit"; - public static final ConfigOption WATERMARK_COLUMN_TIME_UNIT_OPTION = - ConfigOptions.key(PREFIX + WATERMARK_COLUMN_TIME_UNIT) - .enumType(TimeUnit.class) - .defaultValue(TimeUnit.MICROSECONDS); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java deleted file mode 100644 index 0835795119f8..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkRowData.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.NullType; - -public class FlinkRowData { - - private FlinkRowData() {} - - public static RowData.FieldGetter createFieldGetter(LogicalType fieldType, int fieldPos) { - if (fieldType instanceof NullType) { - return rowData -> null; - } - - RowData.FieldGetter flinkFieldGetter = RowData.createFieldGetter(fieldType, fieldPos); - return rowData -> { - // Be sure to check for null values, even if the field is required. Flink - // RowData.createFieldGetter(..) does not null-check optional / nullable types. Without this - // explicit null check, the null flag of BinaryRowData will be ignored and random bytes will - // be parsed as actual values. This will produce incorrect writes instead of failing with a - // NullPointerException. 
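// [Illustrative aside, not part of the original file] A hedged sketch of the effect of the null
// check that follows, assuming a required (non-nullable) INT field at position 0 of a BinaryRowData `row`:
//   RowData.FieldGetter getter = FlinkRowData.createFieldGetter(new IntType(false), 0);
//   Object value = getter.getFieldOrNull(row); // null when row.isNullAt(0), instead of stale bytes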
- if (!fieldType.isNullable() && rowData.isNullAt(fieldPos)) { - return null; - } - return flinkFieldGetter.getFieldOrNull(rowData); - }; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java deleted file mode 100644 index 8e13545cd91a..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.UUID; -import java.util.function.Function; -import java.util.stream.Collectors; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.catalog.UniqueConstraint; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -/** - * Converter between Flink types and Iceberg type. The conversion is not a 1:1 mapping that not - * allows back-and-forth conversion. So some information might get lost during the back-and-forth - * conversion. - * - *

<p>This inconsistent types:
- *
- * <ul>
- *   <li>map Iceberg UUID type to Flink BinaryType(16)
- *   <li>map Flink VarCharType(_) and CharType(_) to Iceberg String type
- *   <li>map Flink VarBinaryType(_) to Iceberg Binary type
- *   <li>map Flink TimeType(_) to Iceberg Time type (microseconds)
- *   <li>map Flink TimestampType(_) to Iceberg Timestamp without zone type (microseconds)
- *   <li>map Flink LocalZonedTimestampType(_) to Iceberg Timestamp with zone type (microseconds)
- *   <li>map Flink MultiSetType to Iceberg Map type(element, int)
- * </ul>
- *
- * <p>
    - */ -public class FlinkSchemaUtil { - - private FlinkSchemaUtil() {} - - /** - * @deprecated will be removed in 2.0.0; use {@link #convert(ResolvedSchema)} instead. - */ - @Deprecated - public static Schema convert(TableSchema schema) { - LogicalType schemaType = schema.toRowDataType().getLogicalType(); - Preconditions.checkArgument( - schemaType instanceof RowType, "Schema logical type should be row type."); - - RowType root = (RowType) schemaType; - Type converted = root.accept(new FlinkTypeToType(root)); - - Schema icebergSchema = new Schema(converted.asStructType().fields()); - if (schema.getPrimaryKey().isPresent()) { - return freshIdentifierFieldIds(icebergSchema, schema.getPrimaryKey().get().getColumns()); - } else { - return icebergSchema; - } - } - - /** Convert the flink table schema to apache iceberg schema with column comment. */ - public static Schema convert(ResolvedSchema flinkSchema) { - List tableColumns = flinkSchema.getColumns(); - // copy from org.apache.flink.table.api.Schema#toRowDataType - DataTypes.Field[] fields = - tableColumns.stream() - .map( - column -> { - if (column.getComment().isPresent()) { - return DataTypes.FIELD( - column.getName(), column.getDataType(), column.getComment().get()); - } else { - return DataTypes.FIELD(column.getName(), column.getDataType()); - } - }) - .toArray(DataTypes.Field[]::new); - - LogicalType schemaType = DataTypes.ROW(fields).notNull().getLogicalType(); - Preconditions.checkArgument( - schemaType instanceof RowType, "Schema logical type should be row type."); - - RowType root = (RowType) schemaType; - Type converted = root.accept(new FlinkTypeToType(root)); - Schema icebergSchema = new Schema(converted.asStructType().fields()); - return flinkSchema - .getPrimaryKey() - .map(pk -> freshIdentifierFieldIds(icebergSchema, pk.getColumns())) - .orElse(icebergSchema); - } - - private static Schema freshIdentifierFieldIds(Schema icebergSchema, List primaryKeys) { - // Locate the identifier field id list. - Set identifierFieldIds = Sets.newHashSet(); - for (String primaryKey : primaryKeys) { - Types.NestedField field = icebergSchema.findField(primaryKey); - Preconditions.checkNotNull( - field, - "Cannot find field ID for the primary key column %s in schema %s", - primaryKey, - icebergSchema); - identifierFieldIds.add(field.fieldId()); - } - return new Schema( - icebergSchema.schemaId(), icebergSchema.asStruct().fields(), identifierFieldIds); - } - - /** - * Convert a Flink {@link TableSchema} to a {@link Schema} based on the given schema. - * - *

<p>This conversion does not assign new ids; it uses ids from the base schema. - * - * <p>
    Data types, field order, and nullability will match the Flink type. This conversion may - * return a schema that is not compatible with base schema. - * - * @param baseSchema a Schema on which conversion is based - * @param flinkSchema a Flink TableSchema - * @return the equivalent Schema - * @throws IllegalArgumentException if the type cannot be converted or there are missing ids - * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #convert(Schema, - * ResolvedSchema)} instead. - */ - @Deprecated - public static Schema convert(Schema baseSchema, TableSchema flinkSchema) { - // convert to a type with fresh ids - Types.StructType struct = convert(flinkSchema).asStruct(); - // reassign ids to match the base schema - Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); - // reassign doc to match the base schema - schema = TypeUtil.reassignDoc(schema, baseSchema); - - // fix types that can't be represented in Flink (UUID) - Schema fixedSchema = FlinkFixupTypes.fixup(schema, baseSchema); - if (flinkSchema.getPrimaryKey().isPresent()) { - return freshIdentifierFieldIds(fixedSchema, flinkSchema.getPrimaryKey().get().getColumns()); - } else { - return fixedSchema; - } - } - - /** - * Convert a Flink {@link ResolvedSchema} to a {@link Schema} based on the given schema. - * - *

<p>This conversion does not assign new ids; it uses ids from the base schema. - * - * <p>
    Data types, field order, and nullability will match the Flink type. This conversion may - * return a schema that is not compatible with base schema. - * - * @param baseSchema a Schema on which conversion is based - * @param flinkSchema a Flink ResolvedSchema - * @return the equivalent Schema - * @throws IllegalArgumentException if the type cannot be converted or there are missing ids - */ - public static Schema convert(Schema baseSchema, ResolvedSchema flinkSchema) { - // convert to a type with fresh ids - Types.StructType struct = convert(flinkSchema).asStruct(); - // reassign ids to match the base schema - Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); - // reassign doc to match the base schema - schema = TypeUtil.reassignDoc(schema, baseSchema); - - // fix types that can't be represented in Flink (UUID) - Schema fixedSchema = FlinkFixupTypes.fixup(schema, baseSchema); - return flinkSchema - .getPrimaryKey() - .map(pk -> freshIdentifierFieldIds(fixedSchema, pk.getColumns())) - .orElse(fixedSchema); - } - - /** - * Convert a {@link Schema} to a {@link RowType Flink type}. - * - * @param schema a Schema - * @return the equivalent Flink type - * @throws IllegalArgumentException if the type cannot be converted to Flink - */ - public static RowType convert(Schema schema) { - return (RowType) TypeUtil.visit(schema, new TypeToFlinkType()); - } - - /** - * Convert a {@link Type} to a {@link LogicalType Flink type}. - * - * @param type a Type - * @return the equivalent Flink type - * @throws IllegalArgumentException if the type cannot be converted to Flink - */ - public static LogicalType convert(Type type) { - return TypeUtil.visit(type, new TypeToFlinkType()); - } - - /** - * Convert a {@link LogicalType Flink type} to a {@link Type}. - * - * @param flinkType a FlinkType - * @return the equivalent Iceberg type - */ - public static Type convert(LogicalType flinkType) { - return flinkType.accept(new FlinkTypeToType()); - } - - /** - * Convert a {@link RowType} to a {@link TableSchema}. - * - * @param rowType a RowType - * @return Flink TableSchema - * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #toResolvedSchema(RowType)} - * instead - */ - @Deprecated - public static TableSchema toSchema(RowType rowType) { - TableSchema.Builder builder = TableSchema.builder(); - for (RowType.RowField field : rowType.getFields()) { - builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); - } - return builder.build(); - } - - /** - * Convert a {@link RowType} to a {@link ResolvedSchema}. - * - * @param rowType a RowType - * @return Flink ResolvedSchema - */ - public static ResolvedSchema toResolvedSchema(RowType rowType) { - List columns = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); - for (RowType.RowField field : rowType.getFields()) { - columns.add( - Column.physical(field.getName(), TypeConversions.fromLogicalToDataType(field.getType()))); - } - - return ResolvedSchema.of(columns); - } - - /** - * Convert a {@link Schema} to a {@link TableSchema}. - * - * @param schema iceberg schema to convert. - * @return Flink TableSchema. - * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #toResolvedSchema(Schema)} - * instead - */ - @Deprecated - public static TableSchema toSchema(Schema schema) { - TableSchema.Builder builder = TableSchema.builder(); - - // Add columns. 
- for (RowType.RowField field : convert(schema).getFields()) { - builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); - } - - // Add primary key. - Set identifierFieldIds = schema.identifierFieldIds(); - if (!identifierFieldIds.isEmpty()) { - List columns = Lists.newArrayListWithExpectedSize(identifierFieldIds.size()); - for (Integer identifierFieldId : identifierFieldIds) { - String columnName = schema.findColumnName(identifierFieldId); - Preconditions.checkNotNull( - columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); - - columns.add(columnName); - } - builder.primaryKey(columns.toArray(new String[0])); - } - - return builder.build(); - } - - /** - * Convert a {@link Schema} to a {@link ResolvedSchema}. - * - * @param schema iceberg schema to convert. - * @return Flink ResolvedSchema. - */ - public static ResolvedSchema toResolvedSchema(Schema schema) { - RowType rowType = convert(schema); - List columns = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); - - // Add columns. - for (RowType.RowField field : rowType.getFields()) { - columns.add( - Column.physical(field.getName(), TypeConversions.fromLogicalToDataType(field.getType()))); - } - - // Add primary key. - Set identifierFieldIds = schema.identifierFieldIds(); - UniqueConstraint uniqueConstraint = null; - if (!identifierFieldIds.isEmpty()) { - List primaryKeyColumns = - Lists.newArrayListWithExpectedSize(identifierFieldIds.size()); - for (Integer identifierFieldId : identifierFieldIds) { - String columnName = schema.findColumnName(identifierFieldId); - Preconditions.checkNotNull( - columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); - - primaryKeyColumns.add(columnName); - } - - uniqueConstraint = - UniqueConstraint.primaryKey(UUID.randomUUID().toString(), primaryKeyColumns); - - validatePrimaryKey(uniqueConstraint, columns); - } - - return new ResolvedSchema(columns, Collections.emptyList(), uniqueConstraint); - } - - /** - * Copied from - * org.apache.flink.table.catalog.DefaultSchemaResolver#validatePrimaryKey(org.apache.flink.table.catalog.UniqueConstraint, - * java.util.List) - */ - private static void validatePrimaryKey(UniqueConstraint primaryKey, List columns) { - final Map columnsByNameLookup = - columns.stream().collect(Collectors.toMap(Column::getName, Function.identity())); - - final Set duplicateColumns = - primaryKey.getColumns().stream() - .filter(name -> Collections.frequency(primaryKey.getColumns(), name) > 1) - .collect(Collectors.toSet()); - - if (!duplicateColumns.isEmpty()) { - throw new ValidationException( - String.format( - "Invalid primary key '%s'. A primary key must not contain duplicate columns. Found: %s", - primaryKey.getName(), duplicateColumns)); - } - - for (String columnName : primaryKey.getColumns()) { - Column column = columnsByNameLookup.get(columnName); - if (column == null) { - throw new ValidationException( - String.format( - "Invalid primary key '%s'. Column '%s' does not exist.", - primaryKey.getName(), columnName)); - } - - if (!column.isPhysical()) { - throw new ValidationException( - String.format( - "Invalid primary key '%s'. Column '%s' is not a physical column.", - primaryKey.getName(), columnName)); - } - - final LogicalType columnType = column.getDataType().getLogicalType(); - if (columnType.isNullable()) { - throw new ValidationException( - String.format( - "Invalid primary key '%s'. 
Column '%s' is nullable.", - primaryKey.getName(), columnName)); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java deleted file mode 100644 index 5fbd84909d69..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.api.common.functions.FilterFunction; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Evaluator; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.types.Types; - -public class FlinkSourceFilter implements FilterFunction { - - private final RowType rowType; - private final Evaluator evaluator; - private final Types.StructType struct; - private volatile RowDataWrapper wrapper; - - public FlinkSourceFilter(Schema schema, Expression expr, boolean caseSensitive) { - this.rowType = FlinkSchemaUtil.convert(schema); - this.struct = schema.asStruct(); - this.evaluator = new Evaluator(struct, expr, caseSensitive); - } - - @Override - public boolean filter(RowData value) { - if (wrapper == null) { - this.wrapper = new RowDataWrapper(rowType, struct); - } - return evaluator.eval(wrapper.wrap(value)); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java deleted file mode 100644 index 408065f06057..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
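Illustrative aside (not part of the original patch): a sketch tying together the FlinkSchemaUtil conversions above; `icebergSchema` is assumed to be an existing Iceberg schema:

import org.apache.flink.table.catalog.ResolvedSchema;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.FlinkSchemaUtil;

class SchemaConversionSketch {
  static Schema roundTrip(Schema icebergSchema) {
    RowType rowType = FlinkSchemaUtil.convert(icebergSchema);                     // Iceberg -> Flink row type
    ResolvedSchema flinkSchema = FlinkSchemaUtil.toResolvedSchema(icebergSchema); // Iceberg -> Flink catalog schema
    // Convert back, re-using field ids from the base schema rather than assigning fresh ones.
    return FlinkSchemaUtil.convert(icebergSchema, flinkSchema);
  }
}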
- */ -package org.apache.iceberg.flink; - -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.BigIntType; -import org.apache.flink.table.types.logical.BinaryType; -import org.apache.flink.table.types.logical.BooleanType; -import org.apache.flink.table.types.logical.CharType; -import org.apache.flink.table.types.logical.DateType; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.DoubleType; -import org.apache.flink.table.types.logical.FloatType; -import org.apache.flink.table.types.logical.IntType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.MultisetType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.SmallIntType; -import org.apache.flink.table.types.logical.TimeType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.flink.table.types.logical.TinyIntType; -import org.apache.flink.table.types.logical.VarBinaryType; -import org.apache.flink.table.types.logical.VarCharType; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -class FlinkTypeToType extends FlinkTypeVisitor { - - private final RowType root; - private int nextId; - - FlinkTypeToType() { - this.root = null; - } - - FlinkTypeToType(RowType root) { - this.root = root; - // the root struct's fields use the first ids - this.nextId = root.getFieldCount(); - } - - private int getNextId() { - int next = nextId; - nextId += 1; - return next; - } - - @Override - public Type visit(CharType charType) { - return Types.StringType.get(); - } - - @Override - public Type visit(VarCharType varCharType) { - return Types.StringType.get(); - } - - @Override - public Type visit(BooleanType booleanType) { - return Types.BooleanType.get(); - } - - @Override - public Type visit(BinaryType binaryType) { - return Types.FixedType.ofLength(binaryType.getLength()); - } - - @Override - public Type visit(VarBinaryType varBinaryType) { - return Types.BinaryType.get(); - } - - @Override - public Type visit(DecimalType decimalType) { - return Types.DecimalType.of(decimalType.getPrecision(), decimalType.getScale()); - } - - @Override - public Type visit(TinyIntType tinyIntType) { - return Types.IntegerType.get(); - } - - @Override - public Type visit(SmallIntType smallIntType) { - return Types.IntegerType.get(); - } - - @Override - public Type visit(IntType intType) { - return Types.IntegerType.get(); - } - - @Override - public Type visit(BigIntType bigIntType) { - return Types.LongType.get(); - } - - @Override - public Type visit(FloatType floatType) { - return Types.FloatType.get(); - } - - @Override - public Type visit(DoubleType doubleType) { - return Types.DoubleType.get(); - } - - @Override - public Type visit(DateType dateType) { - return Types.DateType.get(); - } - - @Override - public Type visit(TimeType timeType) { - return Types.TimeType.get(); - } - - @Override - public Type visit(TimestampType timestampType) { - return Types.TimestampType.withoutZone(); - } - - @Override - public Type visit(LocalZonedTimestampType localZonedTimestampType) { - return Types.TimestampType.withZone(); - } - - @Override - public Type visit(ArrayType arrayType) { - Type elementType = 
arrayType.getElementType().accept(this); - if (arrayType.getElementType().isNullable()) { - return Types.ListType.ofOptional(getNextId(), elementType); - } else { - return Types.ListType.ofRequired(getNextId(), elementType); - } - } - - @Override - public Type visit(MultisetType multisetType) { - Type elementType = multisetType.getElementType().accept(this); - return Types.MapType.ofRequired(getNextId(), getNextId(), elementType, Types.IntegerType.get()); - } - - @Override - public Type visit(MapType mapType) { - // keys in map are not allowed to be null. - Type keyType = mapType.getKeyType().accept(this); - Type valueType = mapType.getValueType().accept(this); - if (mapType.getValueType().isNullable()) { - return Types.MapType.ofOptional(getNextId(), getNextId(), keyType, valueType); - } else { - return Types.MapType.ofRequired(getNextId(), getNextId(), keyType, valueType); - } - } - - @Override - @SuppressWarnings("ReferenceEquality") - public Type visit(RowType rowType) { - List newFields = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); - boolean isRoot = root == rowType; - - List types = - rowType.getFields().stream() - .map(f -> f.getType().accept(this)) - .collect(Collectors.toList()); - - for (int i = 0; i < rowType.getFieldCount(); i++) { - int id = isRoot ? i : getNextId(); - - RowType.RowField field = rowType.getFields().get(i); - String name = field.getName(); - String comment = field.getDescription().orElse(null); - - if (field.getType().isNullable()) { - newFields.add(Types.NestedField.optional(id, name, types.get(i), comment)); - } else { - newFields.add(Types.NestedField.required(id, name, types.get(i), comment)); - } - } - - return Types.StructType.of(newFields); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java deleted file mode 100644 index f3de2416088c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import org.apache.flink.table.types.logical.DayTimeIntervalType; -import org.apache.flink.table.types.logical.DistinctType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.LogicalTypeVisitor; -import org.apache.flink.table.types.logical.NullType; -import org.apache.flink.table.types.logical.RawType; -import org.apache.flink.table.types.logical.StructuredType; -import org.apache.flink.table.types.logical.SymbolType; -import org.apache.flink.table.types.logical.YearMonthIntervalType; -import org.apache.flink.table.types.logical.ZonedTimestampType; - -public abstract class FlinkTypeVisitor implements LogicalTypeVisitor { - - // ------------------------- Unsupported types ------------------------------ - - @Override - public T visit(ZonedTimestampType zonedTimestampType) { - throw new UnsupportedOperationException("Unsupported ZonedTimestampType."); - } - - @Override - public T visit(YearMonthIntervalType yearMonthIntervalType) { - throw new UnsupportedOperationException("Unsupported YearMonthIntervalType."); - } - - @Override - public T visit(DayTimeIntervalType dayTimeIntervalType) { - throw new UnsupportedOperationException("Unsupported DayTimeIntervalType."); - } - - @Override - public T visit(DistinctType distinctType) { - throw new UnsupportedOperationException("Unsupported DistinctType."); - } - - @Override - public T visit(StructuredType structuredType) { - throw new UnsupportedOperationException("Unsupported StructuredType."); - } - - @Override - public T visit(NullType nullType) { - throw new UnsupportedOperationException("Unsupported NullType."); - } - - @Override - public T visit(RawType rawType) { - throw new UnsupportedOperationException("Unsupported RawType."); - } - - @Override - public T visit(SymbolType symbolType) { - throw new UnsupportedOperationException("Unsupported SymbolType."); - } - - @Override - public T visit(LogicalType other) { - throw new UnsupportedOperationException("Unsupported type: " + other); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java deleted file mode 100644 index 222a1e810468..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.time.Duration; -import java.util.Map; -import org.apache.flink.annotation.Experimental; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.sink.shuffle.StatisticsType; - -/** - * A class for common Iceberg configs for Flink writes. - * - *

- * <p>If a config is set at multiple levels, the following order of precedence is used (top to
- * bottom):
- *
- * <ol>
- *   <li>Write options
- *   <li>flink ReadableConfig
- *   <li>Table metadata
- * </ol>
- *
- * The most specific value is set in write options and takes precedence over all other configs. If
- * no write option is provided, this class checks the flink configuration for any overrides. If no
- * applicable value is found in the write options, this class uses the table metadata.
- *
- *

    Note this class is NOT meant to be serialized. - */ -public class FlinkWriteConf { - - private final FlinkConfParser confParser; - - public FlinkWriteConf( - Table table, Map writeOptions, ReadableConfig readableConfig) { - this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); - } - - public FlinkWriteConf(Map writeOptions, ReadableConfig readableConfig) { - this.confParser = new FlinkConfParser(writeOptions, readableConfig); - } - - public boolean overwriteMode() { - return confParser - .booleanConf() - .option(FlinkWriteOptions.OVERWRITE_MODE.key()) - .flinkConfig(FlinkWriteOptions.OVERWRITE_MODE) - .defaultValue(FlinkWriteOptions.OVERWRITE_MODE.defaultValue()) - .parse(); - } - - public boolean upsertMode() { - return confParser - .booleanConf() - .option(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key()) - .flinkConfig(FlinkWriteOptions.WRITE_UPSERT_ENABLED) - .tableProperty(TableProperties.UPSERT_ENABLED) - .defaultValue(TableProperties.UPSERT_ENABLED_DEFAULT) - .parse(); - } - - public FileFormat dataFileFormat() { - String valueAsString = - confParser - .stringConf() - .option(FlinkWriteOptions.WRITE_FORMAT.key()) - .flinkConfig(FlinkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); - return FileFormat.fromString(valueAsString); - } - - public long targetDataFileSize() { - return confParser - .longConf() - .option(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES.key()) - .flinkConfig(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES) - .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) - .defaultValue(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT) - .parse(); - } - - public String parquetCompressionCodec() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) - .tableProperty(TableProperties.PARQUET_COMPRESSION) - .defaultValue(TableProperties.PARQUET_COMPRESSION_DEFAULT) - .parse(); - } - - public String parquetCompressionLevel() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_LEVEL.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_LEVEL) - .tableProperty(TableProperties.PARQUET_COMPRESSION_LEVEL) - .defaultValue(TableProperties.PARQUET_COMPRESSION_LEVEL_DEFAULT) - .parseOptional(); - } - - public String avroCompressionCodec() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) - .tableProperty(TableProperties.AVRO_COMPRESSION) - .defaultValue(TableProperties.AVRO_COMPRESSION_DEFAULT) - .parse(); - } - - public String avroCompressionLevel() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_LEVEL.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_LEVEL) - .tableProperty(TableProperties.AVRO_COMPRESSION_LEVEL) - .defaultValue(TableProperties.AVRO_COMPRESSION_LEVEL_DEFAULT) - .parseOptional(); - } - - public String orcCompressionCodec() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) - .tableProperty(TableProperties.ORC_COMPRESSION) - .defaultValue(TableProperties.ORC_COMPRESSION_DEFAULT) - .parse(); - } - - public String orcCompressionStrategy() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_STRATEGY.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_STRATEGY) - 
.tableProperty(TableProperties.ORC_COMPRESSION_STRATEGY) - .defaultValue(TableProperties.ORC_COMPRESSION_STRATEGY_DEFAULT) - .parse(); - } - - public DistributionMode distributionMode() { - String modeName = - confParser - .stringConf() - .option(FlinkWriteOptions.DISTRIBUTION_MODE.key()) - .flinkConfig(FlinkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) - .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_NONE) - .parse(); - return DistributionMode.fromName(modeName); - } - - public StatisticsType rangeDistributionStatisticsType() { - String name = - confParser - .stringConf() - .option(FlinkWriteOptions.RANGE_DISTRIBUTION_STATISTICS_TYPE.key()) - .flinkConfig(FlinkWriteOptions.RANGE_DISTRIBUTION_STATISTICS_TYPE) - .defaultValue(FlinkWriteOptions.RANGE_DISTRIBUTION_STATISTICS_TYPE.defaultValue()) - .parse(); - return StatisticsType.valueOf(name); - } - - public double rangeDistributionSortKeyBaseWeight() { - return confParser - .doubleConf() - .option(FlinkWriteOptions.RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT.key()) - .flinkConfig(FlinkWriteOptions.RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT) - .defaultValue(FlinkWriteOptions.RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT.defaultValue()) - .parse(); - } - - public int workerPoolSize() { - return confParser - .intConf() - .flinkConfig(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE) - .defaultValue(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue()) - .parse(); - } - - public String branch() { - return confParser - .stringConf() - .option(FlinkWriteOptions.BRANCH.key()) - .defaultValue(FlinkWriteOptions.BRANCH.defaultValue()) - .parse(); - } - - public Integer writeParallelism() { - return confParser.intConf().option(FlinkWriteOptions.WRITE_PARALLELISM.key()).parseOptional(); - } - - public boolean compactMode() { - return confParser - .booleanConf() - .option(FlinkWriteOptions.COMPACTION_ENABLE.key()) - .flinkConfig(FlinkWriteOptions.COMPACTION_ENABLE) - .defaultValue(FlinkWriteOptions.COMPACTION_ENABLE.defaultValue()) - .parse(); - } - - /** - * NOTE: This may be removed or changed in a future release. This value specifies the interval for - * refreshing the table instances in sink writer subtasks. If not specified then the default - * behavior is to not refresh the table. - * - * @return the interval for refreshing the table in sink writer subtasks - */ - @Experimental - public Duration tableRefreshInterval() { - return confParser - .durationConf() - .option(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key()) - .flinkConfig(FlinkWriteOptions.TABLE_REFRESH_INTERVAL) - .parseOptional(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java deleted file mode 100644 index 6bdb01c3f5d3..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.time.Duration; -import org.apache.flink.annotation.Experimental; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.flink.sink.shuffle.StatisticsType; - -/** Flink sink write options */ -public class FlinkWriteOptions { - - private FlinkWriteOptions() {} - - // File format for write operations(default: Table write.format.default ) - public static final ConfigOption WRITE_FORMAT = - ConfigOptions.key("write-format").stringType().noDefaultValue(); - - // Overrides this table's write.target-file-size-bytes - public static final ConfigOption TARGET_FILE_SIZE_BYTES = - ConfigOptions.key("target-file-size-bytes").longType().noDefaultValue(); - - // Overrides this table's write..compression-codec - public static final ConfigOption COMPRESSION_CODEC = - ConfigOptions.key("compression-codec").stringType().noDefaultValue(); - - // Overrides this table's write..compression-level - public static final ConfigOption COMPRESSION_LEVEL = - ConfigOptions.key("compression-level").stringType().noDefaultValue(); - - // Overrides this table's write..compression-strategy - public static final ConfigOption COMPRESSION_STRATEGY = - ConfigOptions.key("compression-strategy").stringType().noDefaultValue(); - - // Overrides this table's write.upsert.enabled - public static final ConfigOption WRITE_UPSERT_ENABLED = - ConfigOptions.key("upsert-enabled").booleanType().noDefaultValue(); - - public static final ConfigOption OVERWRITE_MODE = - ConfigOptions.key("overwrite-enabled").booleanType().defaultValue(false); - - // Overrides the table's write.distribution-mode - public static final ConfigOption DISTRIBUTION_MODE = - ConfigOptions.key("distribution-mode").stringType().noDefaultValue(); - - public static final ConfigOption RANGE_DISTRIBUTION_STATISTICS_TYPE = - ConfigOptions.key("range-distribution-statistics-type") - .stringType() - .defaultValue(StatisticsType.Auto.name()) - .withDescription("Type of statistics collection: Auto, Map, Sketch"); - - public static final ConfigOption RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT = - ConfigOptions.key("range-distribution-sort-key-base-weight") - .doubleType() - .defaultValue(0.0d) - .withDescription( - "Base weight for every sort key relative to target weight per writer task"); - - // Branch to write to - public static final ConfigOption BRANCH = - ConfigOptions.key("branch").stringType().defaultValue(SnapshotRef.MAIN_BRANCH); - - public static final ConfigOption WRITE_PARALLELISM = - ConfigOptions.key("write-parallelism").intType().noDefaultValue(); - - public static final ConfigOption COMPACTION_ENABLE = - ConfigOptions.key("compaction-enabled").booleanType().defaultValue(false); - - @Experimental - public static final ConfigOption TABLE_REFRESH_INTERVAL = - ConfigOptions.key("table-refresh-interval").durationType().noDefaultValue(); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java 
b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java deleted file mode 100644 index 218e298c9583..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.List; -import java.util.Map; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSink; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.catalog.UniqueConstraint; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.ProviderContext; -import org.apache.flink.table.connector.sink.DataStreamSinkProvider; -import org.apache.flink.table.connector.sink.DynamicTableSink; -import org.apache.flink.table.connector.sink.abilities.SupportsOverwrite; -import org.apache.flink.table.connector.sink.abilities.SupportsPartitioning; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.flink.sink.FlinkSink; -import org.apache.iceberg.flink.sink.IcebergSink; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; - -public class IcebergTableSink implements DynamicTableSink, SupportsPartitioning, SupportsOverwrite { - private final TableLoader tableLoader; - @Deprecated private final TableSchema tableSchema; - private final ResolvedSchema resolvedSchema; - private final ReadableConfig readableConfig; - private final Map writeProps; - - private boolean overwrite = false; - - private IcebergTableSink(IcebergTableSink toCopy) { - this.tableLoader = toCopy.tableLoader; - this.tableSchema = toCopy.tableSchema; - this.resolvedSchema = toCopy.resolvedSchema; - this.overwrite = toCopy.overwrite; - this.readableConfig = toCopy.readableConfig; - this.writeProps = toCopy.writeProps; - } - - /** - * @deprecated since 1.10.0, will be removed in 2.0.0. 
Use {@link #IcebergTableSink(TableLoader, - * ResolvedSchema, ReadableConfig, Map)} instead - */ - @Deprecated - public IcebergTableSink( - TableLoader tableLoader, - TableSchema tableSchema, - ReadableConfig readableConfig, - Map writeProps) { - this.tableLoader = tableLoader; - this.tableSchema = tableSchema; - this.resolvedSchema = null; - this.readableConfig = readableConfig; - this.writeProps = writeProps; - } - - public IcebergTableSink( - TableLoader tableLoader, - ResolvedSchema resolvedSchema, - ReadableConfig readableConfig, - Map writeProps) { - this.tableLoader = tableLoader; - this.tableSchema = null; - this.resolvedSchema = resolvedSchema; - this.readableConfig = readableConfig; - this.writeProps = writeProps; - } - - @Override - public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { - Preconditions.checkState( - !overwrite || context.isBounded(), - "Unbounded data stream doesn't support overwrite operation."); - - if (resolvedSchema != null) { - List equalityColumns = - resolvedSchema - .getPrimaryKey() - .map(UniqueConstraint::getColumns) - .orElseGet(ImmutableList::of); - - return new DataStreamSinkProvider() { - @Override - public DataStreamSink consumeDataStream( - ProviderContext providerContext, DataStream dataStream) { - if (Boolean.TRUE.equals( - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_V2_SINK))) { - return IcebergSink.forRowData(dataStream) - .tableLoader(tableLoader) - .resolvedSchema(resolvedSchema) - .equalityFieldColumns(equalityColumns) - .overwrite(overwrite) - .setAll(writeProps) - .flinkConf(readableConfig) - .append(); - } else { - return FlinkSink.forRowData(dataStream) - .tableLoader(tableLoader) - .resolvedSchema(resolvedSchema) - .equalityFieldColumns(equalityColumns) - .overwrite(overwrite) - .setAll(writeProps) - .flinkConf(readableConfig) - .append(); - } - } - }; - } else { - List equalityColumns = - tableSchema - .getPrimaryKey() - .map(org.apache.flink.table.api.constraints.UniqueConstraint::getColumns) - .orElseGet(ImmutableList::of); - - return new DataStreamSinkProvider() { - @Override - public DataStreamSink consumeDataStream( - ProviderContext providerContext, DataStream dataStream) { - if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_V2_SINK)) { - return IcebergSink.forRowData(dataStream) - .tableLoader(tableLoader) - .tableSchema(tableSchema) - .equalityFieldColumns(equalityColumns) - .overwrite(overwrite) - .setAll(writeProps) - .flinkConf(readableConfig) - .append(); - } else { - return FlinkSink.forRowData(dataStream) - .tableLoader(tableLoader) - .tableSchema(tableSchema) - .equalityFieldColumns(equalityColumns) - .overwrite(overwrite) - .setAll(writeProps) - .flinkConf(readableConfig) - .append(); - } - } - }; - } - } - - @Override - public void applyStaticPartition(Map partition) { - // The flink's PartitionFanoutWriter will handle the static partition write policy - // automatically. 
- } - - @Override - public ChangelogMode getChangelogMode(ChangelogMode requestedMode) { - ChangelogMode.Builder builder = ChangelogMode.newBuilder(); - for (RowKind kind : requestedMode.getContainedKinds()) { - builder.addContainedKind(kind); - } - return builder.build(); - } - - @Override - public DynamicTableSink copy() { - return new IcebergTableSink(this); - } - - @Override - public String asSummaryString() { - return "Iceberg table sink"; - } - - @Override - public void applyOverwrite(boolean newOverwrite) { - this.overwrite = newOverwrite; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java deleted file mode 100644 index 3ef611f2ded5..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.lang.reflect.Array; -import java.nio.ByteBuffer; -import java.time.LocalDateTime; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.UUIDUtil; - -public class RowDataWrapper implements StructLike { - - private final LogicalType[] types; - private final PositionalGetter[] getters; - private RowData rowData = null; - - public RowDataWrapper(RowType rowType, Types.StructType struct) { - int size = rowType.getFieldCount(); - - types = (LogicalType[]) Array.newInstance(LogicalType.class, size); - getters = (PositionalGetter[]) Array.newInstance(PositionalGetter.class, size); - - for (int i = 0; i < size; i++) { - types[i] = rowType.getTypeAt(i); - getters[i] = buildGetter(types[i], struct.fields().get(i).type()); - } - } - - public RowDataWrapper wrap(RowData data) { - this.rowData = data; - return this; - } - - @Override - public int size() { - return types.length; - } - - @Override - public T get(int pos, Class javaClass) { - if (rowData.isNullAt(pos)) { - return null; - } else if (getters[pos] != null) { - return javaClass.cast(getters[pos].get(rowData, pos)); - } - - Object value = FlinkRowData.createFieldGetter(types[pos], pos).getFieldOrNull(rowData); - return javaClass.cast(value); - } - - @Override - public void 
set(int pos, T value) { - throw new UnsupportedOperationException( - "Could not set a field in the RowDataWrapper because rowData is read-only"); - } - - private interface PositionalGetter { - T get(RowData data, int pos); - } - - private static PositionalGetter buildGetter(LogicalType logicalType, Type type) { - switch (logicalType.getTypeRoot()) { - case TINYINT: - return (row, pos) -> (int) row.getByte(pos); - case SMALLINT: - return (row, pos) -> (int) row.getShort(pos); - case CHAR: - case VARCHAR: - return (row, pos) -> row.getString(pos).toString(); - - case BINARY: - case VARBINARY: - if (Type.TypeID.UUID == type.typeId()) { - return (row, pos) -> UUIDUtil.convert(row.getBinary(pos)); - } else { - return (row, pos) -> ByteBuffer.wrap(row.getBinary(pos)); - } - - case DECIMAL: - DecimalType decimalType = (DecimalType) logicalType; - return (row, pos) -> - row.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); - - case TIME_WITHOUT_TIME_ZONE: - // Time in RowData is in milliseconds (Integer), while iceberg's time is microseconds - // (Long). - return (row, pos) -> ((long) row.getInt(pos)) * 1_000; - - case TIMESTAMP_WITHOUT_TIME_ZONE: - TimestampType timestampType = (TimestampType) logicalType; - return (row, pos) -> { - LocalDateTime localDateTime = - row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); - return DateTimeUtil.microsFromTimestamp(localDateTime); - }; - - case TIMESTAMP_WITH_LOCAL_TIME_ZONE: - LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; - return (row, pos) -> { - TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 - + timestampData.getNanoOfMillisecond() / 1000; - }; - - case ROW: - RowType rowType = (RowType) logicalType; - Types.StructType structType = (Types.StructType) type; - - RowDataWrapper nestedWrapper = new RowDataWrapper(rowType, structType); - return (row, pos) -> nestedWrapper.wrap(row.getRow(pos, rowType.getFieldCount())); - - default: - return null; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java deleted file mode 100644 index da509451fee7..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.io.Closeable; -import java.io.IOException; -import java.io.Serializable; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.hadoop.SerializableConfiguration; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -/** - * Serializable loader to load an Iceberg {@link Table}. Flink needs to get {@link Table} objects in - * the cluster (for example, to get splits), not just on the client side. So we need an Iceberg - * table loader to get the {@link Table} object. - */ -public interface TableLoader extends Closeable, Serializable, Cloneable { - - void open(); - - boolean isOpen(); - - Table loadTable(); - - /** Clone a TableLoader */ - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - TableLoader clone(); - - static TableLoader fromCatalog(CatalogLoader catalogLoader, TableIdentifier identifier) { - return new CatalogTableLoader(catalogLoader, identifier); - } - - static TableLoader fromHadoopTable(String location) { - return fromHadoopTable(location, FlinkCatalogFactory.clusterHadoopConf()); - } - - static TableLoader fromHadoopTable(String location, Configuration hadoopConf) { - return new HadoopTableLoader(location, hadoopConf); - } - - class HadoopTableLoader implements TableLoader { - - private static final long serialVersionUID = 1L; - - private final String location; - private final SerializableConfiguration hadoopConf; - - private transient HadoopTables tables; - - private HadoopTableLoader(String location, Configuration conf) { - this.location = location; - this.hadoopConf = new SerializableConfiguration(conf); - } - - @Override - public void open() { - tables = new HadoopTables(hadoopConf.get()); - } - - @Override - public boolean isOpen() { - return tables != null; - } - - @Override - public Table loadTable() { - FlinkEnvironmentContext.init(); - return tables.load(location); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public TableLoader clone() { - return new HadoopTableLoader(location, new Configuration(hadoopConf.get())); - } - - @Override - public void close() {} - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("location", location).toString(); - } - } - - class CatalogTableLoader implements TableLoader { - - private static final long serialVersionUID = 1L; - - private final CatalogLoader catalogLoader; - private final String identifier; - - private transient Catalog catalog; - - private CatalogTableLoader(CatalogLoader catalogLoader, TableIdentifier tableIdentifier) { - this.catalogLoader = catalogLoader; - this.identifier = tableIdentifier.toString(); - } - - @Override - public void open() { - catalog = catalogLoader.loadCatalog(); - } - - @Override - public boolean isOpen() { - return catalog != null; - } - - @Override - public Table loadTable() { - FlinkEnvironmentContext.init(); - return catalog.loadTable(TableIdentifier.parse(identifier)); - } - - @Override - public void close() throws IOException { - if (catalog instanceof Closeable) { - ((Closeable) catalog).close(); - } - - catalog = null; - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public TableLoader clone() { - return new CatalogTableLoader(catalogLoader.clone(), TableIdentifier.parse(identifier)); - 
} - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("tableIdentifier", identifier) - .add("catalogLoader", catalogLoader) - .toString(); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java deleted file mode 100644 index 72a646991456..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.List; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.BigIntType; -import org.apache.flink.table.types.logical.BinaryType; -import org.apache.flink.table.types.logical.BooleanType; -import org.apache.flink.table.types.logical.DateType; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.DoubleType; -import org.apache.flink.table.types.logical.FloatType; -import org.apache.flink.table.types.logical.IntType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.NullType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.TimeType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.flink.table.types.logical.VarBinaryType; -import org.apache.flink.table.types.logical.VarCharType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -class TypeToFlinkType extends TypeUtil.SchemaVisitor { - TypeToFlinkType() {} - - @Override - public LogicalType schema(Schema schema, LogicalType structType) { - return structType; - } - - @Override - public LogicalType struct(Types.StructType struct, List fieldResults) { - List fields = struct.fields(); - - List flinkFields = Lists.newArrayListWithExpectedSize(fieldResults.size()); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - LogicalType type = fieldResults.get(i); - RowType.RowField flinkField = - new RowType.RowField(field.name(), type.copy(field.isOptional()), field.doc()); - flinkFields.add(flinkField); - } - - return new RowType(flinkFields); - } - - @Override - public LogicalType field(Types.NestedField field, LogicalType fieldResult) { - return fieldResult; - } - - @Override - public LogicalType 
list(Types.ListType list, LogicalType elementResult) { - return new ArrayType(elementResult.copy(list.isElementOptional())); - } - - @Override - public LogicalType map(Types.MapType map, LogicalType keyResult, LogicalType valueResult) { - // keys in map are not allowed to be null. - return new MapType(keyResult.copy(false), valueResult.copy(map.isValueOptional())); - } - - @Override - public LogicalType primitive(Type.PrimitiveType primitive) { - switch (primitive.typeId()) { - case UNKNOWN: - return new NullType(); - case BOOLEAN: - return new BooleanType(); - case INTEGER: - return new IntType(); - case LONG: - return new BigIntType(); - case FLOAT: - return new FloatType(); - case DOUBLE: - return new DoubleType(); - case DATE: - return new DateType(); - case TIME: - // For the type: Flink only support TimeType with default precision (second) now. The - // precision of time is - // not supported in Flink, so we can think of it as a simple time type directly. - // For the data: Flink uses int that support mills to represent time data, so it supports - // mills precision. - return new TimeType(); - case TIMESTAMP: - Types.TimestampType timestamp = (Types.TimestampType) primitive; - if (timestamp.shouldAdjustToUTC()) { - // MICROS - return new LocalZonedTimestampType(6); - } else { - // MICROS - return new TimestampType(6); - } - case TIMESTAMP_NANO: - Types.TimestampNanoType timestamp9 = (Types.TimestampNanoType) primitive; - if (timestamp9.shouldAdjustToUTC()) { - // NANOS - return new LocalZonedTimestampType(9); - } else { - // NANOS - return new TimestampType(9); - } - case STRING: - return new VarCharType(VarCharType.MAX_LENGTH); - case UUID: - // UUID length is 16 - return new BinaryType(16); - case FIXED: - Types.FixedType fixedType = (Types.FixedType) primitive; - return new BinaryType(fixedType.length()); - case BINARY: - return new VarBinaryType(VarBinaryType.MAX_LENGTH); - case DECIMAL: - Types.DecimalType decimal = (Types.DecimalType) primitive; - return new DecimalType(decimal.precision(), decimal.scale()); - default: - throw new UnsupportedOperationException( - "Cannot convert unknown type to Flink: " + primitive); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java deleted file mode 100644 index b96b47c5a785..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.actions; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.Table; - -public class Actions { - - public static final Configuration CONFIG = - new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. - .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private final StreamExecutionEnvironment env; - private final Table table; - - private Actions(StreamExecutionEnvironment env, Table table) { - this.env = env; - this.table = table; - } - - public static Actions forTable(StreamExecutionEnvironment env, Table table) { - return new Actions(env, table); - } - - public static Actions forTable(Table table) { - return new Actions(StreamExecutionEnvironment.getExecutionEnvironment(CONFIG), table); - } - - public RewriteDataFilesAction rewriteDataFiles() { - return new RewriteDataFilesAction(env, table); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java deleted file mode 100644 index 4cf30ed90418..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.actions; - -import java.util.List; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableUtil; -import org.apache.iceberg.actions.BaseRewriteDataFilesAction; -import org.apache.iceberg.flink.source.RowDataRewriter; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class RewriteDataFilesAction extends BaseRewriteDataFilesAction { - - private final StreamExecutionEnvironment env; - private int maxParallelism; - - public RewriteDataFilesAction(StreamExecutionEnvironment env, Table table) { - super(table); - this.env = env; - this.maxParallelism = env.getParallelism(); - Preconditions.checkArgument( - !TableUtil.supportsRowLineage(table), - "Flink does not support compaction on row lineage enabled tables (V3+)"); - } - - @Override - protected FileIO fileIO() { - return table().io(); - } - - @Override - protected List rewriteDataForTasks(List combinedScanTasks) { - int size = combinedScanTasks.size(); - int parallelism = Math.min(size, maxParallelism); - DataStream dataStream = env.fromCollection(combinedScanTasks); - RowDataRewriter rowDataRewriter = - new RowDataRewriter(table(), caseSensitive(), fileIO(), encryptionManager()); - try { - return rowDataRewriter.rewriteDataForTasks(dataStream, parallelism); - } catch (Exception e) { - throw new RuntimeException("Rewrite data file error.", e); - } - } - - @Override - protected RewriteDataFilesAction self() { - return this; - } - - public RewriteDataFilesAction maxParallelism(int parallelism) { - Preconditions.checkArgument(parallelism > 0, "Invalid max parallelism %s", parallelism); - this.maxParallelism = parallelism; - return this; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java deleted file mode 100644 index 8103224a0b6c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.LogicalTypeFamily; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.NullType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.avro.AvroWithPartnerByStructureVisitor; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.Pair; - -public abstract class AvroWithFlinkSchemaVisitor - extends AvroWithPartnerByStructureVisitor { - - @Override - protected boolean isStringType(LogicalType logicalType) { - return logicalType.getTypeRoot().getFamilies().contains(LogicalTypeFamily.CHARACTER_STRING); - } - - @Override - protected boolean isMapType(LogicalType logicalType) { - return logicalType instanceof MapType; - } - - @Override - protected LogicalType arrayElementType(LogicalType arrayType) { - Preconditions.checkArgument( - arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); - return ((ArrayType) arrayType).getElementType(); - } - - @Override - protected LogicalType mapKeyType(LogicalType mapType) { - Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); - return ((MapType) mapType).getKeyType(); - } - - @Override - protected LogicalType mapValueType(LogicalType mapType) { - Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); - return ((MapType) mapType).getValueType(); - } - - @Override - protected Pair fieldNameAndType(LogicalType structType, int pos) { - Preconditions.checkArgument( - structType instanceof RowType, "Invalid struct: %s is not a struct", structType); - RowType.RowField field = ((RowType) structType).getFields().get(pos); - return Pair.of(field.getName(), field.getType()); - } - - @Override - protected LogicalType nullType() { - return new NullType(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java deleted file mode 100644 index 66ed95792e62..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.Stream; -import org.apache.avro.LogicalTypes; -import org.apache.avro.Schema; -import org.apache.avro.io.Encoder; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.avro.MetricsAwareDatumWriter; -import org.apache.iceberg.avro.ValueWriter; -import org.apache.iceberg.avro.ValueWriters; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class FlinkAvroWriter implements MetricsAwareDatumWriter { - private final RowType rowType; - private ValueWriter writer = null; - - public FlinkAvroWriter(RowType rowType) { - this.rowType = rowType; - } - - @Override - @SuppressWarnings("unchecked") - public void setSchema(Schema schema) { - this.writer = - (ValueWriter) - AvroWithFlinkSchemaVisitor.visit(rowType, schema, new WriteBuilder()); - } - - @Override - public void write(RowData datum, Encoder out) throws IOException { - writer.write(datum, out); - } - - @Override - public Stream metrics() { - return writer.metrics(); - } - - private static class WriteBuilder extends AvroWithFlinkSchemaVisitor> { - @Override - public ValueWriter record( - LogicalType struct, Schema record, List names, List> fields) { - return FlinkValueWriters.row( - fields, - IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()) - .collect(Collectors.toList())); - } - - @Override - public ValueWriter union(LogicalType type, Schema union, List> options) { - Preconditions.checkArgument( - options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", - union); - Preconditions.checkArgument( - options.size() == 2, "Cannot create writer for non-option union: %s", union); - if (union.getTypes().get(0).getType() == Schema.Type.NULL) { - return ValueWriters.option(0, options.get(1)); - } else { - return ValueWriters.option(1, options.get(0)); - } - } - - @Override - public ValueWriter array(LogicalType sArray, Schema array, ValueWriter elementWriter) { - return FlinkValueWriters.array(elementWriter, arrayElementType(sArray)); - } - - @Override - public ValueWriter map(LogicalType sMap, Schema map, ValueWriter valueReader) { - return FlinkValueWriters.map( - FlinkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); - } - - @Override - public ValueWriter map( - LogicalType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return FlinkValueWriters.arrayMap( - keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); - } - - @Override - public ValueWriter primitive(LogicalType type, Schema primitive) { - org.apache.avro.LogicalType logicalType = primitive.getLogicalType(); - if (logicalType != null) { - switch (logicalType.getName()) { - case "date": - return ValueWriters.ints(); - - case "time-micros": - return FlinkValueWriters.timeMicros(); - - case "timestamp-micros": - return FlinkValueWriters.timestampMicros(); - - case "timestamp-nanos": - return FlinkValueWriters.timestampNanos(); - - case "decimal": - LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; - return FlinkValueWriters.decimal(decimal.getPrecision(), decimal.getScale()); - - case "uuid": - return ValueWriters.uuids(); - - default: - throw new 
IllegalArgumentException("Unsupported logical type: " + logicalType); - } - } - - switch (primitive.getType()) { - case NULL: - return ValueWriters.nulls(); - case BOOLEAN: - return ValueWriters.booleans(); - case INT: - switch (type.getTypeRoot()) { - case TINYINT: - return ValueWriters.tinyints(); - case SMALLINT: - return ValueWriters.shorts(); - default: - return ValueWriters.ints(); - } - case LONG: - return ValueWriters.longs(); - case FLOAT: - return ValueWriters.floats(); - case DOUBLE: - return ValueWriters.doubles(); - case STRING: - return FlinkValueWriters.strings(); - case FIXED: - return ValueWriters.fixed(primitive.getFixedSize()); - case BYTES: - return ValueWriters.bytes(); - default: - throw new IllegalArgumentException("Unsupported type: " + primitive); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java deleted file mode 100644 index 65b9d44ad4b8..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.orc.OrcRowReader; -import org.apache.iceberg.orc.OrcSchemaWithTypeVisitor; -import org.apache.iceberg.orc.OrcValueReader; -import org.apache.iceberg.orc.OrcValueReaders; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.orc.TypeDescription; -import org.apache.orc.storage.ql.exec.vector.StructColumnVector; -import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; - -public class FlinkOrcReader implements OrcRowReader { - private final OrcValueReader reader; - - public FlinkOrcReader(Schema iSchema, TypeDescription readSchema) { - this(iSchema, readSchema, ImmutableMap.of()); - } - - public FlinkOrcReader(Schema iSchema, TypeDescription readSchema, Map idToConstant) { - this.reader = - OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant)); - } - - @Override - public RowData read(VectorizedRowBatch batch, int row) { - return (RowData) reader.read(new StructColumnVector(batch.size, batch.cols), row); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - reader.setBatchContext(batchOffsetInFile); - } - - private static class ReadBuilder extends OrcSchemaWithTypeVisitor> { - private final Map idToConstant; - - private ReadBuilder(Map idToConstant) { - this.idToConstant = idToConstant; - } - - @Override - public OrcValueReader record( - Types.StructType iStruct, - TypeDescription record, - List names, - List> fields) { - return FlinkOrcReaders.struct(fields, iStruct, idToConstant); - } - - @Override - public OrcValueReader list( - Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { - return FlinkOrcReaders.array(elementReader); - } - - @Override - public OrcValueReader map( - Types.MapType iMap, - TypeDescription map, - OrcValueReader keyReader, - OrcValueReader valueReader) { - return FlinkOrcReaders.map(keyReader, valueReader); - } - - @Override - public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescription primitive) { - switch (iPrimitive.typeId()) { - case BOOLEAN: - return OrcValueReaders.booleans(); - case INTEGER: - return OrcValueReaders.ints(); - case LONG: - return OrcValueReaders.longs(); - case FLOAT: - return OrcValueReaders.floats(); - case DOUBLE: - return OrcValueReaders.doubles(); - case DATE: - return FlinkOrcReaders.dates(); - case TIME: - return FlinkOrcReaders.times(); - case TIMESTAMP: - Types.TimestampType timestampType = (Types.TimestampType) iPrimitive; - if (timestampType.shouldAdjustToUTC()) { - return FlinkOrcReaders.timestampTzs(); - } else { - return FlinkOrcReaders.timestamps(); - } - case STRING: - return FlinkOrcReaders.strings(); - case UUID: - case FIXED: - case BINARY: - return OrcValueReaders.bytes(); - case DECIMAL: - Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; - return FlinkOrcReaders.decimals(decimalType.precision(), decimalType.scale()); - default: - throw new IllegalArgumentException( - String.format( - "Invalid iceberg type %s corresponding to ORC type %s", iPrimitive, primitive)); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java 
b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java deleted file mode 100644 index 7a4a15c7e600..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.math.BigDecimal; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.orc.OrcValueReader; -import org.apache.iceberg.orc.OrcValueReaders; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; -import org.apache.orc.storage.ql.exec.vector.ColumnVector; -import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; -import org.apache.orc.storage.ql.exec.vector.ListColumnVector; -import org.apache.orc.storage.ql.exec.vector.LongColumnVector; -import org.apache.orc.storage.ql.exec.vector.MapColumnVector; -import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; -import org.apache.orc.storage.serde2.io.HiveDecimalWritable; - -class FlinkOrcReaders { - private FlinkOrcReaders() {} - - static OrcValueReader strings() { - return StringReader.INSTANCE; - } - - static OrcValueReader dates() { - return DateReader.INSTANCE; - } - - static OrcValueReader decimals(int precision, int scale) { - if (precision <= 18) { - return new Decimal18Reader(precision, scale); - } else if (precision <= 38) { - return new Decimal38Reader(precision, scale); - } else { - throw new IllegalArgumentException("Invalid precision: " + precision); - } - } - - static OrcValueReader times() { - return TimeReader.INSTANCE; - } - - static OrcValueReader timestamps() { - return TimestampReader.INSTANCE; - } - - static OrcValueReader timestampTzs() { - return TimestampTzReader.INSTANCE; - } - - static OrcValueReader array(OrcValueReader elementReader) { - return new ArrayReader<>(elementReader); - } - - public static OrcValueReader map( - OrcValueReader keyReader, OrcValueReader valueReader) { - 
return new MapReader<>(keyReader, valueReader); - } - - public static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); - } - - private static class StringReader implements OrcValueReader { - private static final StringReader INSTANCE = new StringReader(); - - @Override - public StringData nonNullRead(ColumnVector vector, int row) { - BytesColumnVector bytesVector = (BytesColumnVector) vector; - return StringData.fromBytes( - bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); - } - } - - private static class DateReader implements OrcValueReader { - private static final DateReader INSTANCE = new DateReader(); - - @Override - public Integer nonNullRead(ColumnVector vector, int row) { - return (int) ((LongColumnVector) vector).vector[row]; - } - } - - private static class Decimal18Reader implements OrcValueReader { - private final int precision; - private final int scale; - - Decimal18Reader(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData nonNullRead(ColumnVector vector, int row) { - HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; - - // The hive ORC writer may will adjust the scale of decimal data. - Preconditions.checkArgument( - value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", - precision, - scale, - value); - - return DecimalData.fromUnscaledLong(value.serialize64(scale), precision, scale); - } - } - - private static class Decimal38Reader implements OrcValueReader { - private final int precision; - private final int scale; - - Decimal38Reader(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData nonNullRead(ColumnVector vector, int row) { - BigDecimal value = - ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); - - Preconditions.checkArgument( - value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", - precision, - scale, - value); - - return DecimalData.fromBigDecimal(value, precision, scale); - } - } - - private static class TimeReader implements OrcValueReader { - private static final TimeReader INSTANCE = new TimeReader(); - - @Override - public Integer nonNullRead(ColumnVector vector, int row) { - long micros = ((LongColumnVector) vector).vector[row]; - // Flink only support time mills, just erase micros. 
- return (int) (micros / 1000); - } - } - - private static class TimestampReader implements OrcValueReader { - private static final TimestampReader INSTANCE = new TimestampReader(); - - @Override - public TimestampData nonNullRead(ColumnVector vector, int row) { - TimestampColumnVector tcv = (TimestampColumnVector) vector; - LocalDateTime localDate = - Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime(); - return TimestampData.fromLocalDateTime(localDate); - } - } - - private static class TimestampTzReader implements OrcValueReader { - private static final TimestampTzReader INSTANCE = new TimestampTzReader(); - - @Override - public TimestampData nonNullRead(ColumnVector vector, int row) { - TimestampColumnVector tcv = (TimestampColumnVector) vector; - Instant instant = - Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toInstant(); - return TimestampData.fromInstant(instant); - } - } - - private static class ArrayReader implements OrcValueReader { - private final OrcValueReader elementReader; - - private ArrayReader(OrcValueReader elementReader) { - this.elementReader = elementReader; - } - - @Override - public ArrayData nonNullRead(ColumnVector vector, int row) { - ListColumnVector listVector = (ListColumnVector) vector; - int offset = (int) listVector.offsets[row]; - int length = (int) listVector.lengths[row]; - List elements = Lists.newArrayListWithExpectedSize(length); - for (int c = 0; c < length; ++c) { - elements.add(elementReader.read(listVector.child, offset + c)); - } - return new GenericArrayData(elements.toArray()); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - elementReader.setBatchContext(batchOffsetInFile); - } - } - - private static class MapReader implements OrcValueReader { - private final OrcValueReader keyReader; - private final OrcValueReader valueReader; - - private MapReader(OrcValueReader keyReader, OrcValueReader valueReader) { - this.keyReader = keyReader; - this.valueReader = valueReader; - } - - @Override - public MapData nonNullRead(ColumnVector vector, int row) { - MapColumnVector mapVector = (MapColumnVector) vector; - int offset = (int) mapVector.offsets[row]; - long length = mapVector.lengths[row]; - - Map map = Maps.newHashMap(); - for (int c = 0; c < length; c++) { - K key = keyReader.read(mapVector.keys, offset + c); - V value = valueReader.read(mapVector.values, offset + c); - map.put(key, value); - } - - return new GenericMapData(map); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - keyReader.setBatchContext(batchOffsetInFile); - valueReader.setBatchContext(batchOffsetInFile); - } - } - - private static class StructReader extends OrcValueReaders.StructReader { - private final int numFields; - - StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); - this.numFields = struct.fields().size(); - } - - @Override - protected RowData create() { - return new GenericRowData(numFields); - } - - @Override - protected void set(RowData struct, int pos, Object value) { - ((GenericRowData) struct).setField(pos, value); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java deleted file mode 100644 index 6a31accffd22..000000000000 --- 
a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.Deque; -import java.util.List; -import java.util.stream.Stream; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.orc.GenericOrcWriters; -import org.apache.iceberg.orc.OrcRowWriter; -import org.apache.iceberg.orc.OrcValueWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; - -public class FlinkOrcWriter implements OrcRowWriter { - private final FlinkOrcWriters.RowDataWriter writer; - - private FlinkOrcWriter(RowType rowType, Schema iSchema) { - this.writer = - (FlinkOrcWriters.RowDataWriter) - FlinkSchemaVisitor.visit(rowType, iSchema, new WriteBuilder()); - } - - public static OrcRowWriter buildWriter(RowType rowType, Schema iSchema) { - return new FlinkOrcWriter(rowType, iSchema); - } - - @Override - public void write(RowData row, VectorizedRowBatch output) { - Preconditions.checkArgument(row != null, "value must not be null"); - writer.writeRow(row, output); - } - - @Override - public List> writers() { - return writer.writers(); - } - - @Override - public Stream> metrics() { - return writer.metrics(); - } - - private static class WriteBuilder extends FlinkSchemaVisitor> { - private final Deque fieldIds = Lists.newLinkedList(); - - private WriteBuilder() {} - - @Override - public void beforeField(Types.NestedField field) { - fieldIds.push(field.fieldId()); - } - - @Override - public void afterField(Types.NestedField field) { - fieldIds.pop(); - } - - @Override - public OrcValueWriter record( - Types.StructType iStruct, List> results, List fieldType) { - return FlinkOrcWriters.struct(results, fieldType); - } - - @Override - public OrcValueWriter map( - Types.MapType iMap, - OrcValueWriter key, - OrcValueWriter value, - LogicalType keyType, - LogicalType valueType) { - return FlinkOrcWriters.map(key, value, keyType, valueType); - } - - @Override - public OrcValueWriter list( - Types.ListType iList, OrcValueWriter element, LogicalType elementType) { - return FlinkOrcWriters.list(element, elementType); - } - - @Override - public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { - switch (iPrimitive.typeId()) { - case BOOLEAN: - 
return GenericOrcWriters.booleans(); - case INTEGER: - switch (flinkPrimitive.getTypeRoot()) { - case TINYINT: - return GenericOrcWriters.bytes(); - case SMALLINT: - return GenericOrcWriters.shorts(); - } - return GenericOrcWriters.ints(); - case LONG: - return GenericOrcWriters.longs(); - case FLOAT: - Preconditions.checkArgument( - fieldIds.peek() != null, - String.format( - "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " - + "information is not properly pushed during schema visiting.", - iPrimitive)); - return GenericOrcWriters.floats(fieldIds.peek()); - case DOUBLE: - Preconditions.checkArgument( - fieldIds.peek() != null, - String.format( - "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " - + "information is not properly pushed during schema visiting.", - iPrimitive)); - return GenericOrcWriters.doubles(fieldIds.peek()); - case DATE: - return FlinkOrcWriters.dates(); - case TIME: - return FlinkOrcWriters.times(); - case TIMESTAMP: - Types.TimestampType timestampType = (Types.TimestampType) iPrimitive; - if (timestampType.shouldAdjustToUTC()) { - return FlinkOrcWriters.timestampTzs(); - } else { - return FlinkOrcWriters.timestamps(); - } - case STRING: - return FlinkOrcWriters.strings(); - case UUID: - case FIXED: - case BINARY: - return GenericOrcWriters.byteArrays(); - case DECIMAL: - Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; - return FlinkOrcWriters.decimals(decimalType.precision(), decimalType.scale()); - default: - throw new IllegalArgumentException( - String.format( - "Invalid iceberg type %s corresponding to Flink logical type %s", - iPrimitive, flinkPrimitive)); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java deleted file mode 100644 index afce2cda1db1..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ /dev/null @@ -1,318 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.time.Instant; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.util.List; -import java.util.stream.Stream; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.data.orc.GenericOrcWriters; -import org.apache.iceberg.flink.FlinkRowData; -import org.apache.iceberg.orc.OrcValueWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.orc.storage.common.type.HiveDecimal; -import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; -import org.apache.orc.storage.ql.exec.vector.ColumnVector; -import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; -import org.apache.orc.storage.ql.exec.vector.ListColumnVector; -import org.apache.orc.storage.ql.exec.vector.LongColumnVector; -import org.apache.orc.storage.ql.exec.vector.MapColumnVector; -import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; - -class FlinkOrcWriters { - - private FlinkOrcWriters() {} - - static OrcValueWriter strings() { - return StringWriter.INSTANCE; - } - - static OrcValueWriter dates() { - return DateWriter.INSTANCE; - } - - static OrcValueWriter times() { - return TimeWriter.INSTANCE; - } - - static OrcValueWriter timestamps() { - return TimestampWriter.INSTANCE; - } - - static OrcValueWriter timestampTzs() { - return TimestampTzWriter.INSTANCE; - } - - static OrcValueWriter decimals(int precision, int scale) { - if (precision <= 18) { - return new Decimal18Writer(precision, scale); - } else if (precision <= 38) { - return new Decimal38Writer(precision, scale); - } else { - throw new IllegalArgumentException("Invalid precision: " + precision); - } - } - - static OrcValueWriter list( - OrcValueWriter elementWriter, LogicalType elementType) { - return new ListWriter<>(elementWriter, elementType); - } - - static OrcValueWriter map( - OrcValueWriter keyWriter, - OrcValueWriter valueWriter, - LogicalType keyType, - LogicalType valueType) { - return new MapWriter<>(keyWriter, valueWriter, keyType, valueType); - } - - static OrcValueWriter struct(List> writers, List types) { - return new RowDataWriter(writers, types); - } - - private static class StringWriter implements OrcValueWriter { - private static final StringWriter INSTANCE = new StringWriter(); - - @Override - public void nonNullWrite(int rowId, StringData data, ColumnVector output) { - byte[] value = data.toBytes(); - ((BytesColumnVector) output).setRef(rowId, value, 0, value.length); - } - } - - private static class DateWriter implements OrcValueWriter { - private static final DateWriter INSTANCE = new DateWriter(); - - @Override - public void nonNullWrite(int rowId, Integer data, ColumnVector output) { - ((LongColumnVector) output).vector[rowId] = data; - } - } - - private static class TimeWriter implements OrcValueWriter { - private static final TimeWriter INSTANCE = new TimeWriter(); - - @Override - public void nonNullWrite(int rowId, Integer millis, ColumnVector output) { - // The time in flink is in millisecond, while the standard time in iceberg is microsecond. 
- // So we need to transform it to microsecond. - ((LongColumnVector) output).vector[rowId] = millis * 1000L; - } - } - - private static class TimestampWriter implements OrcValueWriter { - private static final TimestampWriter INSTANCE = new TimestampWriter(); - - @Override - public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { - TimestampColumnVector cv = (TimestampColumnVector) output; - cv.setIsUTC(true); - // millis - OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); - cv.time[rowId] = - offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; - // truncate nanos to only keep microsecond precision. - cv.nanos[rowId] = (offsetDateTime.getNano() / 1_000) * 1_000; - } - } - - private static class TimestampTzWriter implements OrcValueWriter { - private static final TimestampTzWriter INSTANCE = new TimestampTzWriter(); - - @SuppressWarnings("JavaInstantGetSecondsGetNano") - @Override - public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { - TimestampColumnVector cv = (TimestampColumnVector) output; - // millis - Instant instant = data.toInstant(); - cv.time[rowId] = instant.toEpochMilli(); - // truncate nanos to only keep microsecond precision. - cv.nanos[rowId] = (instant.getNano() / 1_000) * 1_000; - } - } - - private static class Decimal18Writer implements OrcValueWriter { - private final int precision; - private final int scale; - - Decimal18Writer(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument( - scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - data); - Preconditions.checkArgument( - data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - data); - - ((DecimalColumnVector) output) - .vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale()); - } - } - - private static class Decimal38Writer implements OrcValueWriter { - private final int precision; - private final int scale; - - Decimal38Writer(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument( - scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - data); - Preconditions.checkArgument( - data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - data); - - ((DecimalColumnVector) output) - .vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false)); - } - } - - static class ListWriter implements OrcValueWriter { - private final OrcValueWriter elementWriter; - private final ArrayData.ElementGetter elementGetter; - - ListWriter(OrcValueWriter elementWriter, LogicalType elementType) { - this.elementWriter = elementWriter; - this.elementGetter = ArrayData.createElementGetter(elementType); - } - - @Override - @SuppressWarnings("unchecked") - public void nonNullWrite(int rowId, ArrayData data, ColumnVector output) { - ListColumnVector cv = (ListColumnVector) output; - cv.lengths[rowId] = data.size(); - cv.offsets[rowId] = cv.childCount; - cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); - // make sure the child is big enough. 
- growColumnVector(cv.child, cv.childCount); - - for (int e = 0; e < cv.lengths[rowId]; ++e) { - Object value = elementGetter.getElementOrNull(data, e); - elementWriter.write((int) (e + cv.offsets[rowId]), (T) value, cv.child); - } - } - - @Override - public Stream> metrics() { - return elementWriter.metrics(); - } - } - - static class MapWriter implements OrcValueWriter { - private final OrcValueWriter keyWriter; - private final OrcValueWriter valueWriter; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - - MapWriter( - OrcValueWriter keyWriter, - OrcValueWriter valueWriter, - LogicalType keyType, - LogicalType valueType) { - this.keyWriter = keyWriter; - this.valueWriter = valueWriter; - this.keyGetter = ArrayData.createElementGetter(keyType); - this.valueGetter = ArrayData.createElementGetter(valueType); - } - - @Override - @SuppressWarnings("unchecked") - public void nonNullWrite(int rowId, MapData data, ColumnVector output) { - MapColumnVector cv = (MapColumnVector) output; - ArrayData keyArray = data.keyArray(); - ArrayData valArray = data.valueArray(); - - // record the length and start of the list elements - cv.lengths[rowId] = data.size(); - cv.offsets[rowId] = cv.childCount; - cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); - // make sure the child is big enough - growColumnVector(cv.keys, cv.childCount); - growColumnVector(cv.values, cv.childCount); - // Add each element - for (int e = 0; e < cv.lengths[rowId]; ++e) { - int pos = (int) (e + cv.offsets[rowId]); - keyWriter.write(pos, (K) keyGetter.getElementOrNull(keyArray, e), cv.keys); - valueWriter.write(pos, (V) valueGetter.getElementOrNull(valArray, e), cv.values); - } - } - - @Override - public Stream> metrics() { - return Stream.concat(keyWriter.metrics(), valueWriter.metrics()); - } - } - - static class RowDataWriter extends GenericOrcWriters.StructWriter { - private final List fieldGetters; - - RowDataWriter(List> writers, List types) { - super(writers); - - this.fieldGetters = Lists.newArrayListWithExpectedSize(types.size()); - for (int i = 0; i < types.size(); i++) { - fieldGetters.add(FlinkRowData.createFieldGetter(types.get(i), i)); - } - } - - @Override - protected Object get(RowData struct, int index) { - return fieldGetters.get(index).getFieldOrNull(struct); - } - } - - private static void growColumnVector(ColumnVector cv, int requestedSize) { - if (cv.isNull.length < requestedSize) { - // Use growth factor of 3 to avoid frequent array allocations - cv.ensureSize(requestedSize * 3, true); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java deleted file mode 100644 index 5c3581aef3ec..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java +++ /dev/null @@ -1,860 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.parquet.ParquetSchemaUtil; -import org.apache.iceberg.parquet.ParquetValueReader; -import org.apache.iceberg.parquet.ParquetValueReaders; -import org.apache.iceberg.parquet.TypeWithSchemaVisitor; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.LogicalTypeAnnotation; -import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -public class FlinkParquetReaders { - private FlinkParquetReaders() {} - - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema) { - return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); - } - - @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema, Map idToConstant) { - return (ParquetValueReader) - TypeWithSchemaVisitor.visit( - expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); - } - - private static class ReadBuilder extends TypeWithSchemaVisitor> { - private final MessageType type; - private final Map idToConstant; - - ReadBuilder(MessageType type, Map idToConstant) { - this.type = type; - this.idToConstant = idToConstant; - } - - @Override - public ParquetValueReader message( - Types.StructType expected, MessageType message, List> fieldReaders) { - return struct(expected, message.asGroupType(), fieldReaders); - } - - @Override - @SuppressWarnings("checkstyle:CyclomaticComplexity") - public ParquetValueReader struct( - Types.StructType expected, GroupType struct, List> fieldReaders) { - // match the expected struct's order - Map> readersById = Maps.newHashMap(); - Map typesById = Maps.newHashMap(); - Map maxDefinitionLevelsById = Maps.newHashMap(); - List fields = 
struct.getFields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i); - if (fieldReaders.get(i) != null) { - int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1; - if (fieldType.getId() != null) { - int id = fieldType.getId().intValue(); - readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); - typesById.put(id, fieldType); - if (idToConstant.containsKey(id)) { - maxDefinitionLevelsById.put(id, fieldD); - } - } - } - } - - List expectedFields = - expected != null ? expected.fields() : ImmutableList.of(); - List> reorderedFields = - Lists.newArrayListWithExpectedSize(expectedFields.size()); - // Defaulting to parent max definition level - int defaultMaxDefinitionLevel = type.getMaxDefinitionLevel(currentPath()); - for (Types.NestedField field : expectedFields) { - int id = field.fieldId(); - ParquetValueReader reader = readersById.get(id); - if (idToConstant.containsKey(id)) { - // containsKey is used because the constant may be null - int fieldMaxDefinitionLevel = - maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel); - reorderedFields.add( - ParquetValueReaders.constant(idToConstant.get(id), fieldMaxDefinitionLevel)); - } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { - reorderedFields.add(ParquetValueReaders.position()); - } else if (id == MetadataColumns.IS_DELETED.fieldId()) { - reorderedFields.add(ParquetValueReaders.constant(false)); - } else if (reader != null) { - reorderedFields.add(reader); - } else if (field.initialDefault() != null) { - reorderedFields.add( - ParquetValueReaders.constant( - RowDataUtil.convertConstant(field.type(), field.initialDefault()), - maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel))); - } else if (field.isOptional()) { - reorderedFields.add(ParquetValueReaders.nulls()); - } else { - throw new IllegalArgumentException( - String.format("Missing required field: %s", field.name())); - } - } - - return new RowDataReader(reorderedFields); - } - - @Override - public ParquetValueReader list( - Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { - if (expectedList == null) { - return null; - } - - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type elementType = ParquetSchemaUtil.determineListElementType(array); - int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - - return new ArrayReader<>( - repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); - } - - @Override - public ParquetValueReader map( - Types.MapType expectedMap, - GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - if (expectedMap == null) { - return null; - } - - GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type keyType = repeatedKeyValue.getType(0); - int keyD = type.getMaxDefinitionLevel(path(keyType.getName())) - 1; - Type valueType = repeatedKeyValue.getType(1); - int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - - return new MapReader<>( - repeatedD, - repeatedR, - ParquetValueReaders.option(keyType, keyD, keyReader), - ParquetValueReaders.option(valueType, valueD, valueReader)); - } - - private static 
class LogicalTypeAnnotationParquetValueReaderVisitor - implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor> { - - private final PrimitiveType primitive; - private final ColumnDescriptor desc; - private final org.apache.iceberg.types.Type.PrimitiveType expected; - - LogicalTypeAnnotationParquetValueReaderVisitor( - PrimitiveType primitive, - ColumnDescriptor desc, - org.apache.iceberg.types.Type.PrimitiveType expected) { - this.primitive = primitive; - this.desc = desc; - this.expected = expected; - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { - return Optional.of(new StringReader(desc)); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { - return Optional.of(new StringReader(desc)); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { - return Optional.of(new StringReader(desc)); - } - - @Override - public Optional> visit( - DecimalLogicalTypeAnnotation decimalLogicalType) { - switch (primitive.getPrimitiveTypeName()) { - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return Optional.of( - new BinaryDecimalReader( - desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); - case INT64: - return Optional.of( - new LongDecimalReader( - desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); - case INT32: - return Optional.of( - new IntegerDecimalReader( - desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); - } - - return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(decimalLogicalType); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { - return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { - if (timeLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MILLIS) { - return Optional.of(new MillisTimeReader(desc)); - } else if (timeLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MICROS) { - return Optional.of(new LossyMicrosToMillisTimeReader(desc)); - } - - return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(timeLogicalType); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { - if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MILLIS) { - return Optional.of(new MillisToTimestampReader(desc)); - } else if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MICROS) { - return Optional.of(new MicrosToTimestampReader(desc)); - } else if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.NANOS) { - return Optional.of(new NanosToTimestampReader(desc)); - } - - return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(timestampLogicalType); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { - int width = intLogicalType.getBitWidth(); - if (width <= 32) { - if (expected.typeId() == Types.LongType.get().typeId()) { - return Optional.of(new ParquetValueReaders.IntAsLongReader(desc)); - } else { - return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); - } - } else if (width <= 64) { - return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); - } - - return 
LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(intLogicalType); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { - return Optional.of(new ParquetValueReaders.ByteArrayReader(desc)); - } - } - - @Override - @SuppressWarnings("CyclomaticComplexity") - public ParquetValueReader primitive( - org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { - if (expected == null) { - return null; - } - - ColumnDescriptor desc = type.getColumnDescription(currentPath()); - LogicalTypeAnnotation logicalTypeAnnotation = primitive.getLogicalTypeAnnotation(); - if (logicalTypeAnnotation != null) { - return logicalTypeAnnotation - .accept(new LogicalTypeAnnotationParquetValueReaderVisitor(primitive, desc, expected)) - .orElseThrow( - () -> - new UnsupportedOperationException( - "Unsupported logical type: " + primitive.getLogicalTypeAnnotation())); - } - - switch (primitive.getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return new ParquetValueReaders.ByteArrayReader(desc); - case INT32: - if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.LONG) { - return new ParquetValueReaders.IntAsLongReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case FLOAT: - if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { - return new ParquetValueReaders.FloatAsDoubleReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case BOOLEAN: - case INT64: - case DOUBLE: - return new ParquetValueReaders.UnboxedReader<>(desc); - default: - throw new UnsupportedOperationException("Unsupported type: " + primitive); - } - } - } - - private static class BinaryDecimalReader - extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - BinaryDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - Binary binary = column.nextBinary(); - BigDecimal bigDecimal = new BigDecimal(new BigInteger(binary.getBytes()), scale); - // TODO: need a unit test to write-read-validate decimal via FlinkParquetWrite/Reader - return DecimalData.fromBigDecimal(bigDecimal, precision, scale); - } - } - - private static class IntegerDecimalReader - extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - IntegerDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - return DecimalData.fromUnscaledLong(column.nextInteger(), precision, scale); - } - } - - private static class LongDecimalReader extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - LongDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - return DecimalData.fromUnscaledLong(column.nextLong(), precision, scale); - } - } - - private static class NanosToTimestampReader - extends ParquetValueReaders.UnboxedReader { - NanosToTimestampReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long value = readLong(); - return 
TimestampData.fromEpochMillis( - Math.floorDiv(value, 1_000_000L), Math.floorMod(value, 1_000_000)); - } - } - - private static class MicrosToTimestampReader - extends ParquetValueReaders.UnboxedReader { - MicrosToTimestampReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long micros = readLong(); - return TimestampData.fromEpochMillis( - Math.floorDiv(micros, 1000L), Math.floorMod(micros, 1000) * 1000); - } - } - - private static class MillisToTimestampReader - extends ParquetValueReaders.UnboxedReader { - MillisToTimestampReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long millis = readLong(); - return TimestampData.fromEpochMillis(millis); - } - } - - private static class StringReader extends ParquetValueReaders.PrimitiveReader { - StringReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public StringData read(StringData ignored) { - Binary binary = column.nextBinary(); - ByteBuffer buffer = binary.toByteBuffer(); - if (buffer.hasArray()) { - return StringData.fromBytes( - buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); - } else { - return StringData.fromBytes(binary.getBytes()); - } - } - } - - private static class LossyMicrosToMillisTimeReader - extends ParquetValueReaders.PrimitiveReader { - LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Integer read(Integer reuse) { - // Discard microseconds since Flink uses millisecond unit for TIME type. - return (int) Math.floorDiv(column.nextLong(), 1000L); - } - } - - private static class MillisTimeReader extends ParquetValueReaders.PrimitiveReader { - MillisTimeReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Integer read(Integer reuse) { - return (int) column.nextLong(); - } - } - - private static class ArrayReader - extends ParquetValueReaders.RepeatedReader { - private int readPos = 0; - private int writePos = 0; - - ArrayReader(int definitionLevel, int repetitionLevel, ParquetValueReader reader) { - super(definitionLevel, repetitionLevel, reader); - } - - @Override - protected ReusableArrayData newListData(ArrayData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableArrayData) { - return (ReusableArrayData) reuse; - } else { - return new ReusableArrayData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected E getElement(ReusableArrayData list) { - E value = null; - if (readPos < list.capacity()) { - value = (E) list.values[readPos]; - } - - readPos += 1; - - return value; - } - - @Override - protected void addElement(ReusableArrayData reused, E element) { - if (writePos >= reused.capacity()) { - reused.grow(); - } - - reused.values[writePos] = element; - - writePos += 1; - } - - @Override - protected ArrayData buildList(ReusableArrayData list) { - // Since ReusableArrayData is not accepted by Flink, use GenericArrayData temporarily to walk - // around it. - // Revert this to use ReusableArrayData once it is fixed in Flink. - // For your reference, https://issues.apache.org/jira/browse/FLINK-25238. 
- return new GenericArrayData(Arrays.copyOf(list.values, writePos)); - } - } - - private static class MapReader - extends ParquetValueReaders.RepeatedKeyValueReader { - private int readPos = 0; - private int writePos = 0; - - private final ParquetValueReaders.ReusableEntry entry = - new ParquetValueReaders.ReusableEntry<>(); - private final ParquetValueReaders.ReusableEntry nullEntry = - new ParquetValueReaders.ReusableEntry<>(); - - MapReader( - int definitionLevel, - int repetitionLevel, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - super(definitionLevel, repetitionLevel, keyReader, valueReader); - } - - @Override - protected ReusableMapData newMapData(MapData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableMapData) { - return (ReusableMapData) reuse; - } else { - return new ReusableMapData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected Map.Entry getPair(ReusableMapData map) { - Map.Entry kv = nullEntry; - if (readPos < map.capacity()) { - entry.set((K) map.keys.values[readPos], (V) map.values.values[readPos]); - kv = entry; - } - - readPos += 1; - - return kv; - } - - @Override - protected void addPair(ReusableMapData map, K key, V value) { - if (writePos >= map.capacity()) { - map.grow(); - } - - map.keys.values[writePos] = key; - map.values.values[writePos] = value; - - writePos += 1; - } - - @Override - protected MapData buildMap(ReusableMapData map) { - map.setNumElements(writePos); - return map; - } - } - - private static class RowDataReader - extends ParquetValueReaders.StructReader { - private final int numFields; - - RowDataReader(List> readers) { - super(readers); - this.numFields = readers.size(); - } - - @Override - protected GenericRowData newStructData(RowData reuse) { - if (reuse instanceof GenericRowData) { - return (GenericRowData) reuse; - } else { - return new GenericRowData(numFields); - } - } - - @Override - protected Object getField(GenericRowData intermediate, int pos) { - return intermediate.getField(pos); - } - - @Override - protected RowData buildStruct(GenericRowData struct) { - return struct; - } - - @Override - protected void set(GenericRowData row, int pos, Object value) { - row.setField(pos, value); - } - - @Override - protected void setNull(GenericRowData row, int pos) { - row.setField(pos, null); - } - - @Override - protected void setBoolean(GenericRowData row, int pos, boolean value) { - row.setField(pos, value); - } - - @Override - protected void setInteger(GenericRowData row, int pos, int value) { - row.setField(pos, value); - } - - @Override - protected void setLong(GenericRowData row, int pos, long value) { - row.setField(pos, value); - } - - @Override - protected void setFloat(GenericRowData row, int pos, float value) { - row.setField(pos, value); - } - - @Override - protected void setDouble(GenericRowData row, int pos, double value) { - row.setField(pos, value); - } - } - - private static class ReusableMapData implements MapData { - private final ReusableArrayData keys; - private final ReusableArrayData values; - - private int numElements; - - private ReusableMapData() { - this.keys = new ReusableArrayData(); - this.values = new ReusableArrayData(); - } - - private void grow() { - keys.grow(); - values.grow(); - } - - private int capacity() { - return keys.capacity(); - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - keys.setNumElements(numElements); - values.setNumElements(numElements); - } - - @Override - public int size() { - 
return numElements; - } - - @Override - public ReusableArrayData keyArray() { - return keys; - } - - @Override - public ReusableArrayData valueArray() { - return values; - } - } - - private static class ReusableArrayData implements ArrayData { - private static final Object[] EMPTY = new Object[0]; - - private Object[] values = EMPTY; - private int numElements = 0; - - private void grow() { - if (values.length == 0) { - this.values = new Object[20]; - } else { - Object[] old = values; - this.values = new Object[old.length << 1]; - // copy the old array in case it has values that can be reused - System.arraycopy(old, 0, values, 0, old.length); - } - } - - private int capacity() { - return values.length; - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - } - - @Override - public int size() { - return numElements; - } - - @Override - public boolean isNullAt(int ordinal) { - return null == values[ordinal]; - } - - @Override - public boolean getBoolean(int ordinal) { - return (boolean) values[ordinal]; - } - - @Override - public byte getByte(int ordinal) { - return (byte) values[ordinal]; - } - - @Override - public short getShort(int ordinal) { - return (short) values[ordinal]; - } - - @Override - public int getInt(int ordinal) { - return (int) values[ordinal]; - } - - @Override - public long getLong(int ordinal) { - return (long) values[ordinal]; - } - - @Override - public float getFloat(int ordinal) { - return (float) values[ordinal]; - } - - @Override - public double getDouble(int ordinal) { - return (double) values[ordinal]; - } - - @Override - public StringData getString(int pos) { - return (StringData) values[pos]; - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return (DecimalData) values[pos]; - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - return (TimestampData) values[pos]; - } - - @SuppressWarnings("unchecked") - @Override - public RawValueData getRawValue(int pos) { - return (RawValueData) values[pos]; - } - - @Override - public byte[] getBinary(int ordinal) { - return (byte[]) values[ordinal]; - } - - @Override - public ArrayData getArray(int ordinal) { - return (ArrayData) values[ordinal]; - } - - @Override - public MapData getMap(int ordinal) { - return (MapData) values[ordinal]; - } - - @Override - public RowData getRow(int pos, int numFields) { - return (RowData) values[pos]; - } - - @Override - public boolean[] toBooleanArray() { - return ArrayUtil.toPrimitive((Boolean[]) values); - } - - @Override - public byte[] toByteArray() { - return ArrayUtil.toPrimitive((Byte[]) values); - } - - @Override - public short[] toShortArray() { - return ArrayUtil.toPrimitive((Short[]) values); - } - - @Override - public int[] toIntArray() { - return ArrayUtil.toPrimitive((Integer[]) values); - } - - @Override - public long[] toLongArray() { - return ArrayUtil.toPrimitive((Long[]) values); - } - - @Override - public float[] toFloatArray() { - return ArrayUtil.toPrimitive((Float[]) values); - } - - @Override - public double[] toDoubleArray() { - return ArrayUtil.toPrimitive((Double[]) values); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java deleted file mode 100644 index 5c90252723bd..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java +++ /dev/null @@ -1,608 +0,0 @@ -/* - * 
Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; -import java.util.Optional; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.LogicalTypeRoot; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.RowType.RowField; -import org.apache.flink.table.types.logical.SmallIntType; -import org.apache.flink.table.types.logical.TinyIntType; -import org.apache.iceberg.flink.FlinkRowData; -import org.apache.iceberg.parquet.ParquetValueReaders; -import org.apache.iceberg.parquet.ParquetValueWriter; -import org.apache.iceberg.parquet.ParquetValueWriters; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.DecimalUtil; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.LogicalTypeAnnotation; -import org.apache.parquet.schema.LogicalTypeAnnotation.BsonLogicalTypeAnnotation; -import org.apache.parquet.schema.LogicalTypeAnnotation.DateLogicalTypeAnnotation; -import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; -import org.apache.parquet.schema.LogicalTypeAnnotation.EnumLogicalTypeAnnotation; -import org.apache.parquet.schema.LogicalTypeAnnotation.IntLogicalTypeAnnotation; -import org.apache.parquet.schema.LogicalTypeAnnotation.JsonLogicalTypeAnnotation; -import org.apache.parquet.schema.LogicalTypeAnnotation.LogicalTypeAnnotationVisitor; -import org.apache.parquet.schema.LogicalTypeAnnotation.StringLogicalTypeAnnotation; -import org.apache.parquet.schema.LogicalTypeAnnotation.TimeLogicalTypeAnnotation; -import org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -public class FlinkParquetWriters { - private FlinkParquetWriters() {} - - @SuppressWarnings("unchecked") - public static ParquetValueWriter buildWriter(LogicalType schema, 
MessageType type) { - return (ParquetValueWriter) - ParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); - } - - private static class WriteBuilder extends ParquetWithFlinkSchemaVisitor> { - private final MessageType type; - - WriteBuilder(MessageType type) { - this.type = type; - } - - @Override - public ParquetValueWriter message( - RowType sStruct, MessageType message, List> fields) { - return struct(sStruct, message.asGroupType(), fields); - } - - @Override - public ParquetValueWriter struct( - RowType sStruct, GroupType struct, List> fieldWriters) { - List flinkFields = sStruct.getFields(); - List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); - List flinkTypes = Lists.newArrayList(); - int[] fieldIndexes = new int[fieldWriters.size()]; - int fieldIndex = 0; - for (int i = 0; i < flinkFields.size(); i += 1) { - LogicalType flinkType = flinkFields.get(i).getType(); - if (!flinkType.is(LogicalTypeRoot.NULL)) { - writers.add(newOption(struct.getType(fieldIndex), fieldWriters.get(fieldIndex))); - flinkTypes.add(flinkType); - fieldIndexes[fieldIndex] = i; - fieldIndex += 1; - } - } - - return new RowDataWriter(fieldIndexes, writers, flinkTypes); - } - - @Override - public ParquetValueWriter list( - ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { - GroupType repeated = array.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath); - int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - - return new ArrayDataWriter<>( - repeatedD, - repeatedR, - newOption(repeated.getType(0), elementWriter), - sArray.getElementType()); - } - - @Override - public ParquetValueWriter map( - MapType sMap, - GroupType map, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter) { - GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath); - int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - - return new MapDataWriter<>( - repeatedD, - repeatedR, - newOption(repeatedKeyValue.getType(0), keyWriter), - newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.getKeyType(), - sMap.getValueType()); - } - - private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { - int maxD = type.getMaxDefinitionLevel(path(fieldType.getName())); - return ParquetValueWriters.option(fieldType, maxD, writer); - } - - @Override - public ParquetValueWriter primitive(LogicalType fType, PrimitiveType primitive) { - ColumnDescriptor desc = type.getColumnDescription(currentPath()); - - LogicalTypeAnnotation annotation = primitive.getLogicalTypeAnnotation(); - if (annotation != null) { - Optional> writer = - annotation.accept(new LogicalTypeWriterBuilder(fType, desc)); - if (writer.isPresent()) { - return writer.get(); - } else { - throw new UnsupportedOperationException( - "Unsupported logical type: " + primitive.getOriginalType()); - } - } - - switch (primitive.getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return byteArrays(desc); - case BOOLEAN: - return ParquetValueWriters.booleans(desc); - case INT32: - return ints(fType, desc); - case INT64: - return ParquetValueWriters.longs(desc); - case FLOAT: - return ParquetValueWriters.floats(desc); - case DOUBLE: - return ParquetValueWriters.doubles(desc); - default: - throw new UnsupportedOperationException("Unsupported type: " + primitive); - } - } - } - - private 
static class LogicalTypeWriterBuilder - implements LogicalTypeAnnotationVisitor> { - private final LogicalType flinkType; - private final ColumnDescriptor desc; - - private LogicalTypeWriterBuilder(LogicalType flinkType, ColumnDescriptor desc) { - this.flinkType = flinkType; - this.desc = desc; - } - - @Override - public Optional> visit(StringLogicalTypeAnnotation strings) { - return Optional.of(strings(desc)); - } - - @Override - public Optional> visit(EnumLogicalTypeAnnotation enums) { - return Optional.of(strings(desc)); - } - - @Override - public Optional> visit(DecimalLogicalTypeAnnotation decimal) { - ParquetValueWriter writer; - switch (desc.getPrimitiveType().getPrimitiveTypeName()) { - case INT32: - writer = decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); - break; - case INT64: - writer = decimalAsLong(desc, decimal.getPrecision(), decimal.getScale()); - break; - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - writer = decimalAsFixed(desc, decimal.getPrecision(), decimal.getScale()); - break; - default: - throw new UnsupportedOperationException( - "Unsupported base type for decimal: " - + desc.getPrimitiveType().getPrimitiveTypeName()); - } - return Optional.of(writer); - } - - @Override - public Optional> visit(DateLogicalTypeAnnotation dates) { - return Optional.of(ints(flinkType, desc)); - } - - @Override - public Optional> visit(TimeLogicalTypeAnnotation times) { - Preconditions.checkArgument( - LogicalTypeAnnotation.TimeUnit.MICROS.equals(times.getUnit()), - "Cannot write time in %s, only MICROS is supported", - times.getUnit()); - return Optional.of(timeMicros(desc)); - } - - @Override - public Optional> visit(TimestampLogicalTypeAnnotation timestamps) { - ParquetValueWriter writer; - switch (timestamps.getUnit()) { - case NANOS: - writer = timestampNanos(desc); - break; - case MICROS: - writer = timestamps(desc); - break; - default: - throw new UnsupportedOperationException("Unsupported timestamp type: " + timestamps); - } - - return Optional.of(writer); - } - - @Override - public Optional> visit(IntLogicalTypeAnnotation type) { - Preconditions.checkArgument(type.isSigned(), "Cannot write unsigned integer type: %s", type); - ParquetValueWriter writer; - if (type.getBitWidth() < 64) { - writer = ints(flinkType, desc); - } else { - writer = ParquetValueWriters.longs(desc); - } - - return Optional.of(writer); - } - - @Override - public Optional> visit(JsonLogicalTypeAnnotation ignored) { - return Optional.of(strings(desc)); - } - - @Override - public Optional> visit(BsonLogicalTypeAnnotation ignored) { - return Optional.of(byteArrays(desc)); - } - } - - private static ParquetValueWriter ints(LogicalType type, ColumnDescriptor desc) { - if (type instanceof TinyIntType) { - return ParquetValueWriters.tinyints(desc); - } else if (type instanceof SmallIntType) { - return ParquetValueWriters.shorts(desc); - } - return ParquetValueWriters.ints(desc); - } - - private static ParquetValueWriter strings(ColumnDescriptor desc) { - return new StringDataWriter(desc); - } - - private static ParquetValueWriter timeMicros(ColumnDescriptor desc) { - return new TimeMicrosWriter(desc); - } - - private static ParquetValueWriter decimalAsInteger( - ColumnDescriptor desc, int precision, int scale) { - Preconditions.checkArgument( - precision <= 9, - "Cannot write decimal value as integer with precision larger than 9," - + " wrong precision %s", - precision); - return new IntegerDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriter decimalAsLong( - 
ColumnDescriptor desc, int precision, int scale) { - Preconditions.checkArgument( - precision <= 18, - "Cannot write decimal value as long with precision larger than 18, " - + " wrong precision %s", - precision); - return new LongDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriter decimalAsFixed( - ColumnDescriptor desc, int precision, int scale) { - return new FixedDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriter timestamps(ColumnDescriptor desc) { - return new TimestampDataWriter(desc); - } - - private static ParquetValueWriter timestampNanos(ColumnDescriptor desc) { - return new TimestampNanoDataWriter(desc); - } - - private static ParquetValueWriter byteArrays(ColumnDescriptor desc) { - return new ByteArrayWriter(desc); - } - - private static class StringDataWriter extends ParquetValueWriters.PrimitiveWriter { - private StringDataWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, StringData value) { - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(value.toBytes())); - } - } - - private static class TimeMicrosWriter extends ParquetValueWriters.PrimitiveWriter { - private TimeMicrosWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, Integer value) { - long micros = value.longValue() * 1000; - column.writeLong(repetitionLevel, micros); - } - } - - private static class IntegerDecimalWriter - extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - - private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument( - decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - decimal); - Preconditions.checkArgument( - decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - decimal); - - column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); - } - } - - private static class LongDecimalWriter extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - - private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument( - decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - decimal); - Preconditions.checkArgument( - decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - decimal); - - column.writeLong(repetitionLevel, decimal.toUnscaledLong()); - } - } - - private static class FixedDecimalWriter extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - private final ThreadLocal bytes; - - private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - this.bytes = - ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - byte[] binary = - 
DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); - } - } - - private static class TimestampDataWriter - extends ParquetValueWriters.PrimitiveWriter { - private TimestampDataWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, TimestampData value) { - column.writeLong( - repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); - } - } - - private static class TimestampNanoDataWriter - extends ParquetValueWriters.PrimitiveWriter { - private TimestampNanoDataWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, TimestampData value) { - column.writeLong( - repetitionLevel, value.getMillisecond() * 1_000_000L + value.getNanoOfMillisecond()); - } - } - - private static class ByteArrayWriter extends ParquetValueWriters.PrimitiveWriter { - private ByteArrayWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, byte[] bytes) { - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(bytes)); - } - } - - private static class ArrayDataWriter extends ParquetValueWriters.RepeatedWriter { - private final LogicalType elementType; - - private ArrayDataWriter( - int definitionLevel, - int repetitionLevel, - ParquetValueWriter writer, - LogicalType elementType) { - super(definitionLevel, repetitionLevel, writer); - this.elementType = elementType; - } - - @Override - protected Iterator elements(ArrayData list) { - return new ElementIterator<>(list); - } - - private class ElementIterator implements Iterator { - private final int size; - private final ArrayData list; - private final ArrayData.ElementGetter getter; - private int index; - - private ElementIterator(ArrayData list) { - this.list = list; - size = list.size(); - getter = ArrayData.createElementGetter(elementType); - index = 0; - } - - @Override - public boolean hasNext() { - return index != size; - } - - @Override - @SuppressWarnings("unchecked") - public E next() { - if (index >= size) { - throw new NoSuchElementException(); - } - - E element = (E) getter.getElementOrNull(list, index); - index += 1; - - return element; - } - } - } - - private static class MapDataWriter - extends ParquetValueWriters.RepeatedKeyValueWriter { - private final LogicalType keyType; - private final LogicalType valueType; - - private MapDataWriter( - int definitionLevel, - int repetitionLevel, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter, - LogicalType keyType, - LogicalType valueType) { - super(definitionLevel, repetitionLevel, keyWriter, valueWriter); - this.keyType = keyType; - this.valueType = valueType; - } - - @Override - protected Iterator> pairs(MapData map) { - return new EntryIterator<>(map); - } - - private class EntryIterator implements Iterator> { - private final int size; - private final ArrayData keys; - private final ArrayData values; - private final ParquetValueReaders.ReusableEntry entry; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - private int index; - - private EntryIterator(MapData map) { - size = map.size(); - keys = map.keyArray(); - values = map.valueArray(); - entry = new ParquetValueReaders.ReusableEntry<>(); - keyGetter = ArrayData.createElementGetter(keyType); - valueGetter = ArrayData.createElementGetter(valueType); - index = 0; - } - - @Override - public 
boolean hasNext() { - return index != size; - } - - @Override - @SuppressWarnings("unchecked") - public Map.Entry next() { - if (index >= size) { - throw new NoSuchElementException(); - } - - entry.set( - (K) keyGetter.getElementOrNull(keys, index), - (V) valueGetter.getElementOrNull(values, index)); - index += 1; - - return entry; - } - } - } - - private static class RowDataWriter extends ParquetValueWriters.StructWriter { - private final RowData.FieldGetter[] fieldGetter; - - RowDataWriter( - int[] fieldIndexes, List> writers, List types) { - super(writers); - fieldGetter = new RowData.FieldGetter[types.size()]; - for (int i = 0; i < types.size(); i += 1) { - fieldGetter[i] = FlinkRowData.createFieldGetter(types.get(i), fieldIndexes[i]); - } - } - - @Override - protected Object get(RowData struct, int index) { - return fieldGetter[index].getFieldOrNull(struct); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java deleted file mode 100644 index edc7041a4d04..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkPlannedAvroReader.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
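For context on the decimal writers above: the Preconditions checks in decimalAsInteger and decimalAsLong (precision at most 9 and 18, respectively) reflect the fact that a decimal's unscaled value fits in a 32-bit int only up to 9 digits and in a 64-bit long only up to 18 digits. A minimal, hypothetical sketch (plain Flink DecimalData, not part of this patch) illustrating that bound:

```java
import java.math.BigDecimal;
import org.apache.flink.table.data.DecimalData;

public class DecimalUnscaledExample {
  public static void main(String[] args) {
    // decimal(9, 2): at most 9 significant digits, so the unscaled value is at most
    // 999_999_999, which always fits in an int -- the bound checked by IntegerDecimalWriter.
    DecimalData small = DecimalData.fromBigDecimal(new BigDecimal("1234567.89"), 9, 2);
    int unscaledAsInt = (int) small.toUnscaledLong();
    System.out.println(unscaledAsInt); // 123456789

    // decimal(18, 4): the unscaled value (at most 999_999_999_999_999_999) still fits
    // in a signed long, which is why LongDecimalWriter allows precision up to 18.
    DecimalData large = DecimalData.fromBigDecimal(new BigDecimal("12345678901234.5678"), 18, 4);
    System.out.println(large.toUnscaledLong()); // 123456789012345678
  }
}
```

Anything wider than 18 digits falls through to FixedDecimalWriter, which encodes the unscaled value as a fixed-length byte array instead.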
- */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.function.Supplier; -import org.apache.avro.LogicalType; -import org.apache.avro.LogicalTypes; -import org.apache.avro.Schema; -import org.apache.avro.io.DatumReader; -import org.apache.avro.io.Decoder; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.avro.AvroWithPartnerVisitor; -import org.apache.iceberg.avro.SupportsRowPosition; -import org.apache.iceberg.avro.ValueReader; -import org.apache.iceberg.avro.ValueReaders; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; - -public class FlinkPlannedAvroReader implements DatumReader, SupportsRowPosition { - - private final Types.StructType expectedType; - private final Map idToConstant; - private ValueReader reader; - - public static FlinkPlannedAvroReader create(org.apache.iceberg.Schema schema) { - return create(schema, ImmutableMap.of()); - } - - public static FlinkPlannedAvroReader create( - org.apache.iceberg.Schema schema, Map constants) { - return new FlinkPlannedAvroReader(schema, constants); - } - - private FlinkPlannedAvroReader( - org.apache.iceberg.Schema expectedSchema, Map constants) { - this.expectedType = expectedSchema.asStruct(); - this.idToConstant = constants; - } - - @Override - @SuppressWarnings("unchecked") - public void setSchema(Schema fileSchema) { - this.reader = - (ValueReader) - AvroWithPartnerVisitor.visit( - expectedType, - fileSchema, - new ReadBuilder(idToConstant), - AvroWithPartnerVisitor.FieldIDAccessors.get()); - } - - @Override - public RowData read(RowData reuse, Decoder decoder) throws IOException { - return reader.read(decoder, reuse); - } - - @Override - public void setRowPositionSupplier(Supplier posSupplier) { - if (reader instanceof SupportsRowPosition) { - ((SupportsRowPosition) reader).setRowPositionSupplier(posSupplier); - } - } - - private static class ReadBuilder extends AvroWithPartnerVisitor> { - private final Map idToConstant; - - private ReadBuilder(Map idToConstant) { - this.idToConstant = idToConstant; - } - - @Override - public ValueReader record(Type partner, Schema record, List> fieldReaders) { - if (partner == null) { - return ValueReaders.skipStruct(fieldReaders); - } - - Types.StructType expected = partner.asStructType(); - List>> readPlan = - ValueReaders.buildReadPlan( - expected, record, fieldReaders, idToConstant, RowDataUtil::convertConstant); - - // TODO: should this pass expected so that struct.get can reuse containers? 
- return FlinkValueReaders.struct(readPlan, expected.fields().size()); - } - - @Override - public ValueReader union(Type partner, Schema union, List> options) { - return ValueReaders.union(options); - } - - @Override - public ValueReader array(Type partner, Schema array, ValueReader elementReader) { - return FlinkValueReaders.array(elementReader); - } - - @Override - public ValueReader arrayMap( - Type partner, Schema map, ValueReader keyReader, ValueReader valueReader) { - return FlinkValueReaders.arrayMap(keyReader, valueReader); - } - - @Override - public ValueReader map(Type partner, Schema map, ValueReader valueReader) { - return FlinkValueReaders.map(FlinkValueReaders.strings(), valueReader); - } - - @Override - public ValueReader primitive(Type partner, Schema primitive) { - LogicalType logicalType = primitive.getLogicalType(); - if (logicalType != null) { - switch (logicalType.getName()) { - case "date": - // Flink uses the same representation - return ValueReaders.ints(); - - case "time-micros": - return FlinkValueReaders.timeMicros(); - - case "timestamp-millis": - return FlinkValueReaders.timestampMills(); - - case "timestamp-micros": - return FlinkValueReaders.timestampMicros(); - - case "timestamp-nanos": - return FlinkValueReaders.timestampNanos(); - - case "decimal": - LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; - return FlinkValueReaders.decimal( - ValueReaders.decimalBytesReader(primitive), - decimal.getPrecision(), - decimal.getScale()); - - case "uuid": - return FlinkValueReaders.uuids(); - - default: - throw new IllegalArgumentException("Unknown logical type: " + logicalType.getName()); - } - } - - switch (primitive.getType()) { - case NULL: - return ValueReaders.nulls(); - case BOOLEAN: - return ValueReaders.booleans(); - case INT: - if (partner != null && partner.typeId() == Type.TypeID.LONG) { - return ValueReaders.intsAsLongs(); - } - return ValueReaders.ints(); - case LONG: - return ValueReaders.longs(); - case FLOAT: - if (partner != null && partner.typeId() == Type.TypeID.DOUBLE) { - return ValueReaders.floatsAsDoubles(); - } - return ValueReaders.floats(); - case DOUBLE: - return ValueReaders.doubles(); - case STRING: - return FlinkValueReaders.strings(); - case FIXED: - return ValueReaders.fixed(primitive.getFixedSize()); - case BYTES: - return ValueReaders.bytes(); - case ENUM: - return FlinkValueReaders.enums(primitive.getEnumSymbols()); - default: - throw new IllegalArgumentException("Unsupported type: " + primitive); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java deleted file mode 100644 index ba4e1a7a7aec..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.List; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -abstract class FlinkSchemaVisitor { - - static T visit(RowType flinkType, Schema schema, FlinkSchemaVisitor visitor) { - return visit(flinkType, schema.asStruct(), visitor); - } - - private static T visit(LogicalType flinkType, Type iType, FlinkSchemaVisitor visitor) { - switch (iType.typeId()) { - case STRUCT: - return visitRecord(flinkType, iType.asStructType(), visitor); - - case MAP: - MapType mapType = (MapType) flinkType; - Types.MapType iMapType = iType.asMapType(); - T key; - T value; - - Types.NestedField keyField = iMapType.field(iMapType.keyId()); - visitor.beforeMapKey(keyField); - try { - key = visit(mapType.getKeyType(), iMapType.keyType(), visitor); - } finally { - visitor.afterMapKey(keyField); - } - - Types.NestedField valueField = iMapType.field(iMapType.valueId()); - visitor.beforeMapValue(valueField); - try { - value = visit(mapType.getValueType(), iMapType.valueType(), visitor); - } finally { - visitor.afterMapValue(valueField); - } - - return visitor.map(iMapType, key, value, mapType.getKeyType(), mapType.getValueType()); - - case LIST: - ArrayType listType = (ArrayType) flinkType; - Types.ListType iListType = iType.asListType(); - T element; - - Types.NestedField elementField = iListType.field(iListType.elementId()); - visitor.beforeListElement(elementField); - try { - element = visit(listType.getElementType(), iListType.elementType(), visitor); - } finally { - visitor.afterListElement(elementField); - } - - return visitor.list(iListType, element, listType.getElementType()); - - default: - return visitor.primitive(iType.asPrimitiveType(), flinkType); - } - } - - private static T visitRecord( - LogicalType flinkType, Types.StructType struct, FlinkSchemaVisitor visitor) { - Preconditions.checkArgument(flinkType instanceof RowType, "%s is not a RowType.", flinkType); - RowType rowType = (RowType) flinkType; - - int fieldSize = struct.fields().size(); - List results = Lists.newArrayListWithExpectedSize(fieldSize); - List fieldTypes = Lists.newArrayListWithExpectedSize(fieldSize); - List nestedFields = struct.fields(); - - for (int i = 0; i < fieldSize; i++) { - Types.NestedField iField = nestedFields.get(i); - int fieldIndex = rowType.getFieldIndex(iField.name()); - Preconditions.checkArgument( - fieldIndex >= 0, "NestedField: %s is not found in flink RowType: %s", iField, rowType); - - LogicalType fieldFlinkType = rowType.getTypeAt(fieldIndex); - - fieldTypes.add(fieldFlinkType); - - visitor.beforeField(iField); - try { - results.add(visit(fieldFlinkType, iField.type(), visitor)); - } finally { - 
visitor.afterField(iField); - } - } - - return visitor.record(struct, results, fieldTypes); - } - - public T record(Types.StructType iStruct, List results, List fieldTypes) { - return null; - } - - public T list(Types.ListType iList, T element, LogicalType elementType) { - return null; - } - - public T map(Types.MapType iMap, T key, T value, LogicalType keyType, LogicalType valueType) { - return null; - } - - public T primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { - return null; - } - - public void beforeField(Types.NestedField field) {} - - public void afterField(Types.NestedField field) {} - - public void beforeListElement(Types.NestedField elementField) { - beforeField(elementField); - } - - public void afterListElement(Types.NestedField elementField) { - afterField(elementField); - } - - public void beforeMapKey(Types.NestedField keyField) { - beforeField(keyField); - } - - public void afterMapKey(Types.NestedField keyField) { - afterField(keyField); - } - - public void beforeMapValue(Types.NestedField valueField) { - beforeField(valueField); - } - - public void afterMapValue(Types.NestedField valueField) { - afterField(valueField); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java deleted file mode 100644 index 80b36d939ece..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
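The FlinkSchemaVisitor deleted above walks a Flink RowType and the corresponding Iceberg schema in lockstep and dispatches to record/list/map/primitive callbacks. As a hypothetical illustration only (assuming the class remains available on the classpath and that its result type parameter is the generic T not visible in this flattened diff), a subclass that counts primitive fields could look like the following sketch:

```java
package org.apache.iceberg.flink.data;

import java.util.List;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;

// Hypothetical helper: count primitive fields by visiting the Flink and Iceberg schemas together.
class PrimitiveCountingVisitor extends FlinkSchemaVisitor<Integer> {

  static int countPrimitives(RowType flinkType, Schema schema) {
    return FlinkSchemaVisitor.visit(flinkType, schema, new PrimitiveCountingVisitor());
  }

  @Override
  public Integer record(Types.StructType iStruct, List<Integer> results, List<LogicalType> fieldTypes) {
    int sum = 0;
    for (Integer count : results) {
      sum += count; // each child result is the primitive count of that field
    }
    return sum;
  }

  @Override
  public Integer list(Types.ListType iList, Integer element, LogicalType elementType) {
    return element;
  }

  @Override
  public Integer map(Types.MapType iMap, Integer key, Integer value, LogicalType keyType, LogicalType valueType) {
    return key + value;
  }

  @Override
  public Integer primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) {
    return 1;
  }
}
```

A RowType for the call can be derived from the Iceberg schema with FlinkSchemaUtil.convert(schema), as the surrounding code does elsewhere.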
- */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Map; -import org.apache.avro.io.Decoder; -import org.apache.avro.util.Utf8; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.avro.ValueReader; -import org.apache.iceberg.avro.ValueReaders; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; - -public class FlinkValueReaders { - - private FlinkValueReaders() {} - - static ValueReader strings() { - return StringReader.INSTANCE; - } - - static ValueReader enums(List symbols) { - return new EnumReader(symbols); - } - - static ValueReader uuids() { - return ValueReaders.fixed(16); - } - - static ValueReader timeMicros() { - return TimeMicrosReader.INSTANCE; - } - - static ValueReader timestampMills() { - return TimestampMillsReader.INSTANCE; - } - - static ValueReader timestampMicros() { - return TimestampMicrosReader.INSTANCE; - } - - static ValueReader timestampNanos() { - return TimestampNanosReader.INSTANCE; - } - - static ValueReader decimal( - ValueReader unscaledReader, int precision, int scale) { - return new DecimalReader(unscaledReader, precision, scale); - } - - static ValueReader array(ValueReader elementReader) { - return new ArrayReader(elementReader); - } - - static ValueReader arrayMap(ValueReader keyReader, ValueReader valueReader) { - return new ArrayMapReader(keyReader, valueReader); - } - - static ValueReader map(ValueReader keyReader, ValueReader valueReader) { - return new MapReader(keyReader, valueReader); - } - - static ValueReader struct(List>> readPlan, int numFields) { - return new PlannedStructReader(readPlan, numFields); - } - - static ValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); - } - - private static class StringReader implements ValueReader { - private static final StringReader INSTANCE = new StringReader(); - - private StringReader() {} - - @Override - public StringData read(Decoder decoder, Object reuse) throws IOException { - // use the decoder's readString(Utf8) method because it may be a resolving decoder - Utf8 utf8 = null; - if (reuse instanceof StringData) { - utf8 = new Utf8(((StringData) reuse).toBytes()); - } - - Utf8 string = decoder.readString(utf8); - return StringData.fromBytes(string.getBytes(), 0, string.getByteLength()); - } - } - - private static class EnumReader implements ValueReader { - private final StringData[] symbols; - - private EnumReader(List symbols) { - this.symbols = new StringData[symbols.size()]; - for (int i = 0; i < this.symbols.length; i += 1) { - this.symbols[i] = StringData.fromBytes(symbols.get(i).getBytes(StandardCharsets.UTF_8)); - } - } - - @Override - public StringData read(Decoder decoder, Object ignore) throws IOException { - int index = decoder.readEnum(); - return symbols[index]; - 
} - } - - private static class DecimalReader implements ValueReader { - private final ValueReader bytesReader; - private final int precision; - private final int scale; - - private DecimalReader(ValueReader bytesReader, int precision, int scale) { - this.bytesReader = bytesReader; - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(Decoder decoder, Object reuse) throws IOException { - byte[] bytes = bytesReader.read(decoder, null); - return DecimalData.fromBigDecimal( - new BigDecimal(new BigInteger(bytes), scale), precision, scale); - } - } - - private static class TimeMicrosReader implements ValueReader { - private static final TimeMicrosReader INSTANCE = new TimeMicrosReader(); - - @Override - public Integer read(Decoder decoder, Object reuse) throws IOException { - long micros = decoder.readLong(); - // Flink only support time mills, just erase micros. - return (int) (micros / 1000); - } - } - - private static class TimestampMillsReader implements ValueReader { - private static final TimestampMillsReader INSTANCE = new TimestampMillsReader(); - - @Override - public TimestampData read(Decoder decoder, Object reuse) throws IOException { - return TimestampData.fromEpochMillis(decoder.readLong()); - } - } - - private static class TimestampMicrosReader implements ValueReader { - private static final TimestampMicrosReader INSTANCE = new TimestampMicrosReader(); - - @Override - public TimestampData read(Decoder decoder, Object reuse) throws IOException { - long micros = decoder.readLong(); - long mills = Math.floorDiv(micros, 1000); - int nanos = Math.floorMod(micros, 1000) * 1000; - return TimestampData.fromEpochMillis(mills, nanos); - } - } - - private static class TimestampNanosReader implements ValueReader { - private static final TimestampNanosReader INSTANCE = new TimestampNanosReader(); - - @Override - public TimestampData read(Decoder decoder, Object reuse) throws IOException { - long nanos = decoder.readLong(); - long mills = Math.floorDiv(nanos, 1_000_000); - int leftover = Math.floorMod(nanos, 1_000_000); - return TimestampData.fromEpochMillis(mills, leftover); - } - } - - private static class ArrayReader implements ValueReader { - private final ValueReader elementReader; - private final List reusedList = Lists.newArrayList(); - - private ArrayReader(ValueReader elementReader) { - this.elementReader = elementReader; - } - - @Override - public GenericArrayData read(Decoder decoder, Object reuse) throws IOException { - reusedList.clear(); - long chunkLength = decoder.readArrayStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedList.add(elementReader.read(decoder, null)); - } - - chunkLength = decoder.arrayNext(); - } - - // this will convert the list to an array so it is okay to reuse the list - return new GenericArrayData(reusedList.toArray()); - } - } - - private static MapData kvArrayToMap(List keyList, List valueList) { - Map map = Maps.newHashMap(); - Object[] keys = keyList.toArray(); - Object[] values = valueList.toArray(); - for (int i = 0; i < keys.length; i++) { - map.put(keys[i], values[i]); - } - - return new GenericMapData(map); - } - - private static class ArrayMapReader implements ValueReader { - private final ValueReader keyReader; - private final ValueReader valueReader; - - private final List reusedKeyList = Lists.newArrayList(); - private final List reusedValueList = Lists.newArrayList(); - - private ArrayMapReader(ValueReader keyReader, ValueReader valueReader) { - this.keyReader = 
keyReader; - this.valueReader = valueReader; - } - - @Override - public MapData read(Decoder decoder, Object reuse) throws IOException { - reusedKeyList.clear(); - reusedValueList.clear(); - - long chunkLength = decoder.readArrayStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedKeyList.add(keyReader.read(decoder, null)); - reusedValueList.add(valueReader.read(decoder, null)); - } - - chunkLength = decoder.arrayNext(); - } - - return kvArrayToMap(reusedKeyList, reusedValueList); - } - } - - private static class MapReader implements ValueReader { - private final ValueReader keyReader; - private final ValueReader valueReader; - - private final List reusedKeyList = Lists.newArrayList(); - private final List reusedValueList = Lists.newArrayList(); - - private MapReader(ValueReader keyReader, ValueReader valueReader) { - this.keyReader = keyReader; - this.valueReader = valueReader; - } - - @Override - public MapData read(Decoder decoder, Object reuse) throws IOException { - reusedKeyList.clear(); - reusedValueList.clear(); - - long chunkLength = decoder.readMapStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedKeyList.add(keyReader.read(decoder, null)); - reusedValueList.add(valueReader.read(decoder, null)); - } - - chunkLength = decoder.mapNext(); - } - - return kvArrayToMap(reusedKeyList, reusedValueList); - } - } - - private static class PlannedStructReader extends ValueReaders.PlannedStructReader { - private final int numFields; - - private PlannedStructReader(List>> readPlan, int numFields) { - super(readPlan); - this.numFields = numFields; - } - - @Override - protected RowData reuseOrCreate(Object reuse) { - if (reuse instanceof GenericRowData && ((GenericRowData) reuse).getArity() == numFields) { - return (RowData) reuse; - } - return new GenericRowData(numFields); - } - - @Override - protected Object get(RowData struct, int pos) { - return null; - } - - @Override - protected void set(RowData struct, int pos, Object value) { - ((GenericRowData) struct).setField(pos, value); - } - } - - private static class StructReader extends ValueReaders.StructReader { - private final int numFields; - - private StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); - this.numFields = readers.size(); - } - - @Override - protected RowData reuseOrCreate(Object reuse) { - if (reuse instanceof GenericRowData && ((GenericRowData) reuse).getArity() == numFields) { - return (GenericRowData) reuse; - } - return new GenericRowData(numFields); - } - - @Override - protected Object get(RowData struct, int pos) { - return null; - } - - @Override - protected void set(RowData struct, int pos, Object value) { - ((GenericRowData) struct).setField(pos, value); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java deleted file mode 100644 index f87e63704965..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
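The Avro timestamp readers above split a microsecond (or nanosecond) count into epoch milliseconds plus a non-negative remainder using Math.floorDiv/floorMod, so that pre-epoch values still yield a valid nano-of-millisecond. A small stand-alone sketch (hypothetical, plain Flink and JDK APIs) of the same arithmetic:

```java
import org.apache.flink.table.data.TimestampData;

public class MicrosToTimestampDataExample {
  public static void main(String[] args) {
    // A negative microsecond value (before the epoch) shows why floorDiv/floorMod are
    // preferred over plain division: the remainder must stay non-negative.
    long micros = -1_500L; // 1.5 ms before the epoch

    long millis = Math.floorDiv(micros, 1000L);              // -2
    int nanosOfMilli = (int) Math.floorMod(micros, 1000L) * 1000; // 500_000

    TimestampData ts = TimestampData.fromEpochMillis(millis, nanosOfMilli);
    System.out.println(ts); // 1969-12-31T23:59:59.998500
  }
}
```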
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.lang.reflect.Array; -import java.util.List; -import org.apache.avro.io.Encoder; -import org.apache.avro.util.Utf8; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.iceberg.avro.ValueWriter; -import org.apache.iceberg.flink.FlinkRowData; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.DecimalUtil; - -public class FlinkValueWriters { - - private FlinkValueWriters() {} - - static ValueWriter strings() { - return StringWriter.INSTANCE; - } - - static ValueWriter timeMicros() { - return TimeMicrosWriter.INSTANCE; - } - - static ValueWriter timestampMicros() { - return TimestampMicrosWriter.INSTANCE; - } - - static ValueWriter timestampNanos() { - return TimestampNanosWriter.INSTANCE; - } - - static ValueWriter decimal(int precision, int scale) { - return new DecimalWriter(precision, scale); - } - - static ValueWriter array(ValueWriter elementWriter, LogicalType elementType) { - return new ArrayWriter<>(elementWriter, elementType); - } - - static ValueWriter arrayMap( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - return new ArrayMapWriter<>(keyWriter, keyType, valueWriter, valueType); - } - - static ValueWriter map( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - return new MapWriter<>(keyWriter, keyType, valueWriter, valueType); - } - - static ValueWriter row(List> writers, List types) { - return new RowWriter(writers, types); - } - - private static class StringWriter implements ValueWriter { - private static final StringWriter INSTANCE = new StringWriter(); - - private StringWriter() {} - - @Override - public void write(StringData s, Encoder encoder) throws IOException { - // toBytes is cheaper than Avro calling toString, which incurs encoding costs - encoder.writeString(new Utf8(s.toBytes())); - } - } - - private static class DecimalWriter implements ValueWriter { - private final int precision; - private final int scale; - private final ThreadLocal bytes; - - private DecimalWriter(int precision, int scale) { - this.precision = precision; - this.scale = scale; - this.bytes = - ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); - } - - @Override - public void write(DecimalData d, Encoder encoder) throws IOException { - encoder.writeFixed( - DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), bytes.get())); - } - } - - private static class TimeMicrosWriter implements ValueWriter { - private static final TimeMicrosWriter INSTANCE = 
new TimeMicrosWriter(); - - @Override - public void write(Integer timeMills, Encoder encoder) throws IOException { - encoder.writeLong(timeMills * 1000L); - } - } - - private static class TimestampMicrosWriter implements ValueWriter { - private static final TimestampMicrosWriter INSTANCE = new TimestampMicrosWriter(); - - @Override - public void write(TimestampData timestampData, Encoder encoder) throws IOException { - long micros = - timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; - encoder.writeLong(micros); - } - } - - private static class TimestampNanosWriter implements ValueWriter { - private static final TimestampNanosWriter INSTANCE = new TimestampNanosWriter(); - - @Override - public void write(TimestampData timestampData, Encoder encoder) throws IOException { - long nanos = - timestampData.getMillisecond() * 1_000_000 + timestampData.getNanoOfMillisecond(); - encoder.writeLong(nanos); - } - } - - private static class ArrayWriter implements ValueWriter { - private final ValueWriter elementWriter; - private final ArrayData.ElementGetter elementGetter; - - private ArrayWriter(ValueWriter elementWriter, LogicalType elementType) { - this.elementWriter = elementWriter; - this.elementGetter = ArrayData.createElementGetter(elementType); - } - - @Override - @SuppressWarnings("unchecked") - public void write(ArrayData array, Encoder encoder) throws IOException { - encoder.writeArrayStart(); - int numElements = array.size(); - encoder.setItemCount(numElements); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - elementWriter.write((T) elementGetter.getElementOrNull(array, i), encoder); - } - encoder.writeArrayEnd(); - } - } - - private static class ArrayMapWriter implements ValueWriter { - private final ValueWriter keyWriter; - private final ValueWriter valueWriter; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - - private ArrayMapWriter( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - this.keyWriter = keyWriter; - this.keyGetter = ArrayData.createElementGetter(keyType); - this.valueWriter = valueWriter; - this.valueGetter = ArrayData.createElementGetter(valueType); - } - - @Override - @SuppressWarnings("unchecked") - public void write(MapData map, Encoder encoder) throws IOException { - encoder.writeArrayStart(); - int numElements = map.size(); - encoder.setItemCount(numElements); - ArrayData keyArray = map.keyArray(); - ArrayData valueArray = map.valueArray(); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - keyWriter.write((K) keyGetter.getElementOrNull(keyArray, i), encoder); - valueWriter.write((V) valueGetter.getElementOrNull(valueArray, i), encoder); - } - encoder.writeArrayEnd(); - } - } - - private static class MapWriter implements ValueWriter { - private final ValueWriter keyWriter; - private final ValueWriter valueWriter; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - - private MapWriter( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - this.keyWriter = keyWriter; - this.keyGetter = ArrayData.createElementGetter(keyType); - this.valueWriter = valueWriter; - this.valueGetter = ArrayData.createElementGetter(valueType); - } - - @Override - @SuppressWarnings("unchecked") - public void write(MapData map, Encoder encoder) throws IOException { - encoder.writeMapStart(); - int numElements 
= map.size(); - encoder.setItemCount(numElements); - ArrayData keyArray = map.keyArray(); - ArrayData valueArray = map.valueArray(); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - keyWriter.write((K) keyGetter.getElementOrNull(keyArray, i), encoder); - valueWriter.write((V) valueGetter.getElementOrNull(valueArray, i), encoder); - } - encoder.writeMapEnd(); - } - } - - static class RowWriter implements ValueWriter { - private final ValueWriter[] writers; - private final RowData.FieldGetter[] getters; - - private RowWriter(List> writers, List types) { - this.writers = (ValueWriter[]) Array.newInstance(ValueWriter.class, writers.size()); - this.getters = new RowData.FieldGetter[writers.size()]; - for (int i = 0; i < writers.size(); i += 1) { - this.writers[i] = writers.get(i); - this.getters[i] = FlinkRowData.createFieldGetter(types.get(i), i); - } - } - - @Override - public void write(RowData row, Encoder encoder) throws IOException { - for (int i = 0; i < writers.length; i += 1) { - if (row.isNullAt(i)) { - writers[i].write(null, encoder); - } else { - write(row, i, writers[i], encoder); - } - } - } - - @SuppressWarnings("unchecked") - private void write(RowData row, int pos, ValueWriter writer, Encoder encoder) - throws IOException { - writer.write((T) getters[pos].getFieldOrNull(row), encoder); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java deleted file mode 100644 index 6bb2693a0986..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
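The RowWriter above pre-computes one field getter per column and reuses it for every row (via Iceberg's FlinkRowData helper). A minimal, hypothetical sketch of the same mechanism using Flink's built-in RowData.createFieldGetter directly:

```java
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.types.logical.IntType;
import org.apache.flink.table.types.logical.VarCharType;

public class FieldGetterExample {
  public static void main(String[] args) {
    // A two-column row: (id INT, name STRING).
    GenericRowData row = new GenericRowData(2);
    row.setField(0, 42);
    row.setField(1, StringData.fromString("iceberg"));

    // Position-bound getters, created once and reused for every row -- the same idea
    // RowWriter relies on before handing each value to a per-type Avro writer.
    RowData.FieldGetter idGetter = RowData.createFieldGetter(new IntType(), 0);
    RowData.FieldGetter nameGetter = RowData.createFieldGetter(new VarCharType(VarCharType.MAX_LENGTH), 1);

    System.out.println(idGetter.getFieldOrNull(row));   // 42
    System.out.println(nameGetter.getFieldOrNull(row)); // iceberg
  }
}
```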
- */ -package org.apache.iceberg.flink.data; - -import java.util.Deque; -import java.util.List; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.LogicalTypeRoot; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.RowType.RowField; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -public class ParquetWithFlinkSchemaVisitor { - private final Deque fieldNames = Lists.newLinkedList(); - - public static T visit( - LogicalType sType, Type type, ParquetWithFlinkSchemaVisitor visitor) { - Preconditions.checkArgument(sType != null, "Invalid DataType: null"); - if (type instanceof MessageType) { - Preconditions.checkArgument( - sType instanceof RowType, "Invalid struct: %s is not a struct", sType); - RowType struct = (RowType) sType; - return visitor.message( - struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); - } else if (type.isPrimitive()) { - return visitor.primitive(sType, type.asPrimitiveType()); - } else { - // if not a primitive, the typeId must be a group - GroupType group = type.asGroupType(); - OriginalType annotation = group.getOriginalType(); - if (annotation != null) { - switch (annotation) { - case LIST: - Preconditions.checkArgument( - !group.isRepetition(Type.Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", - group); - Preconditions.checkArgument( - group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", - group); - - GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument( - repeatedElement.isRepetition(Type.Repetition.REPEATED), - "Invalid list: inner group is not repeated"); - Preconditions.checkArgument( - repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", - group); - - Preconditions.checkArgument( - sType instanceof ArrayType, "Invalid list: %s is not an array", sType); - ArrayType array = (ArrayType) sType; - RowType.RowField element = - new RowField( - "element", array.getElementType(), "element of " + array.asSummaryString()); - - visitor.fieldNames.push(repeatedElement.getName()); - try { - T elementResult = null; - if (repeatedElement.getFieldCount() > 0) { - elementResult = visitField(element, repeatedElement.getType(0), visitor); - } - - return visitor.list(array, group, elementResult); - - } finally { - visitor.fieldNames.pop(); - } - - case MAP: - Preconditions.checkArgument( - !group.isRepetition(Type.Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", - group); - Preconditions.checkArgument( - group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", - group); - - GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument( - repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), - "Invalid map: inner group is not repeated"); - Preconditions.checkArgument( - repeatedKeyValue.getFieldCount() <= 2, - "Invalid map: repeated group does not 
have 2 fields"); - - Preconditions.checkArgument( - sType instanceof MapType, "Invalid map: %s is not a map", sType); - MapType map = (MapType) sType; - RowField keyField = - new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); - RowField valueField = - new RowField("value", map.getValueType(), "value of " + map.asSummaryString()); - - visitor.fieldNames.push(repeatedKeyValue.getName()); - try { - T keyResult = null; - T valueResult = null; - switch (repeatedKeyValue.getFieldCount()) { - case 2: - // if there are 2 fields, both key and value are projected - keyResult = visitField(keyField, repeatedKeyValue.getType(0), visitor); - valueResult = visitField(valueField, repeatedKeyValue.getType(1), visitor); - break; - case 1: - // if there is just one, use the name to determine what it is - Type keyOrValue = repeatedKeyValue.getType(0); - if (keyOrValue.getName().equalsIgnoreCase("key")) { - keyResult = visitField(keyField, keyOrValue, visitor); - // value result remains null - } else { - valueResult = visitField(valueField, keyOrValue, visitor); - // key result remains null - } - break; - default: - // both results will remain null - } - - return visitor.map(map, group, keyResult, valueResult); - - } finally { - visitor.fieldNames.pop(); - } - - default: - } - } - Preconditions.checkArgument( - sType instanceof RowType, "Invalid struct: %s is not a struct", sType); - RowType struct = (RowType) sType; - return visitor.struct(struct, group, visitFields(struct, group, visitor)); - } - } - - private static T visitField( - RowType.RowField sField, Type field, ParquetWithFlinkSchemaVisitor visitor) { - visitor.fieldNames.push(field.getName()); - try { - return visit(sField.getType(), field, visitor); - } finally { - visitor.fieldNames.pop(); - } - } - - private static List visitFields( - RowType struct, GroupType group, ParquetWithFlinkSchemaVisitor visitor) { - List sFields = struct.getFields(); - List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); - - int pos = 0; - for (RowField sField : sFields) { - if (sField.getType().getTypeRoot() == LogicalTypeRoot.NULL) { - // skip null types that are not in the Parquet schema - continue; - } - - Type field = group.getFields().get(pos); - Preconditions.checkArgument( - field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), - "Structs do not match: field %s != %s", - field.getName(), - sField.getName()); - results.add(visitField(sField, field, visitor)); - - pos += 1; - } - - return results; - } - - public T message(RowType sStruct, MessageType message, List fields) { - return null; - } - - public T struct(RowType sStruct, GroupType struct, List fields) { - return null; - } - - public T list(ArrayType sArray, GroupType array, T element) { - return null; - } - - public T map(MapType sMap, GroupType map, T key, T value) { - return null; - } - - public T primitive(LogicalType sPrimitive, PrimitiveType primitive) { - return null; - } - - protected String[] currentPath() { - return Lists.newArrayList(fieldNames.descendingIterator()).toArray(new String[0]); - } - - protected String[] path(String name) { - List list = Lists.newArrayList(fieldNames.descendingIterator()); - list.add(name); - return list.toArray(new String[0]); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java deleted file mode 100644 index 9395b0e4810e..000000000000 --- 
a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java +++ /dev/null @@ -1,342 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.Arrays; -import java.util.Map; -import java.util.Objects; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.StringUtils; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkRowData; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; - -public class RowDataProjection implements RowData { - /** - * Creates a projecting wrapper for {@link RowData} rows. - * - *

<p>
    This projection will not project the nested children types of repeated types like lists and - * maps. - * - * @param schema schema of rows wrapped by this projection - * @param projectedSchema result schema of the projected rows - * @return a wrapper to project rows - */ - public static RowDataProjection create(Schema schema, Schema projectedSchema) { - return RowDataProjection.create( - FlinkSchemaUtil.convert(schema), schema.asStruct(), projectedSchema.asStruct()); - } - - /** - * Creates a projecting wrapper for {@link RowData} rows. - * - *
<p>
    This projection will not project the nested children types of repeated types like lists and - * maps. - * - * @param rowType flink row type of rows wrapped by this projection - * @param schema schema of rows wrapped by this projection - * @param projectedSchema result schema of the projected rows - * @return a wrapper to project rows - */ - public static RowDataProjection create( - RowType rowType, Types.StructType schema, Types.StructType projectedSchema) { - return new RowDataProjection(rowType, schema, projectedSchema); - } - - private final RowData.FieldGetter[] getters; - private RowData rowData; - - private RowDataProjection( - RowType rowType, Types.StructType rowStruct, Types.StructType projectType) { - Map fieldIdToPosition = Maps.newHashMap(); - for (int i = 0; i < rowStruct.fields().size(); i++) { - fieldIdToPosition.put(rowStruct.fields().get(i).fieldId(), i); - } - - this.getters = new RowData.FieldGetter[projectType.fields().size()]; - for (int i = 0; i < getters.length; i++) { - Types.NestedField projectField = projectType.fields().get(i); - Types.NestedField rowField = rowStruct.field(projectField.fieldId()); - - Preconditions.checkNotNull( - rowField, - "Cannot locate the project field <%s> in the iceberg struct <%s>", - projectField, - rowStruct); - - getters[i] = - createFieldGetter( - rowType, fieldIdToPosition.get(projectField.fieldId()), rowField, projectField); - } - } - - private static RowData.FieldGetter createFieldGetter( - RowType rowType, int position, Types.NestedField rowField, Types.NestedField projectField) { - Preconditions.checkArgument( - rowField.type().typeId() == projectField.type().typeId(), - "Different iceberg type between row field <%s> and project field <%s>", - rowField, - projectField); - - switch (projectField.type().typeId()) { - case STRUCT: - RowType nestedRowType = (RowType) rowType.getTypeAt(position); - return row -> { - // null nested struct value - if (row.isNullAt(position)) { - return null; - } - - RowData nestedRow = row.getRow(position, nestedRowType.getFieldCount()); - return RowDataProjection.create( - nestedRowType, rowField.type().asStructType(), projectField.type().asStructType()) - .wrap(nestedRow); - }; - - case MAP: - Types.MapType projectedMap = projectField.type().asMapType(); - Types.MapType originalMap = rowField.type().asMapType(); - - boolean keyProjectable = - !projectedMap.keyType().isNestedType() - || projectedMap.keyType().equals(originalMap.keyType()); - boolean valueProjectable = - !projectedMap.valueType().isNestedType() - || projectedMap.valueType().equals(originalMap.valueType()); - Preconditions.checkArgument( - keyProjectable && valueProjectable, - "Cannot project a partial map key or value with non-primitive type. Trying to project <%s> out of <%s>", - projectField, - rowField); - - return FlinkRowData.createFieldGetter(rowType.getTypeAt(position), position); - - case LIST: - Types.ListType projectedList = projectField.type().asListType(); - Types.ListType originalList = rowField.type().asListType(); - - boolean elementProjectable = - !projectedList.elementType().isNestedType() - || projectedList.elementType().equals(originalList.elementType()); - Preconditions.checkArgument( - elementProjectable, - "Cannot project a partial list element with non-primitive type. 
Trying to project <%s> out of <%s>", - projectField, - rowField); - - return FlinkRowData.createFieldGetter(rowType.getTypeAt(position), position); - - default: - return FlinkRowData.createFieldGetter(rowType.getTypeAt(position), position); - } - } - - public RowData wrap(RowData row) { - // StructProjection allow wrapping null root struct object. - // See more discussions in https://github.com/apache/iceberg/pull/7517. - // RowDataProjection never allowed null root object to be wrapped. - // Hence, it is fine to enforce strict Preconditions check here. - Preconditions.checkArgument(row != null, "Invalid row data: null"); - this.rowData = row; - return this; - } - - private Object getValue(int pos) { - Preconditions.checkState(rowData != null, "Row data not wrapped"); - return getters[pos].getFieldOrNull(rowData); - } - - @Override - public int getArity() { - return getters.length; - } - - @Override - public RowKind getRowKind() { - Preconditions.checkState(rowData != null, "Row data not wrapped"); - return rowData.getRowKind(); - } - - @Override - public void setRowKind(RowKind kind) { - throw new UnsupportedOperationException("Cannot set row kind in the RowDataProjection"); - } - - @Override - public boolean isNullAt(int pos) { - return getValue(pos) == null; - } - - @Override - public boolean getBoolean(int pos) { - return (boolean) getValue(pos); - } - - @Override - public byte getByte(int pos) { - return (byte) getValue(pos); - } - - @Override - public short getShort(int pos) { - return (short) getValue(pos); - } - - @Override - public int getInt(int pos) { - return (int) getValue(pos); - } - - @Override - public long getLong(int pos) { - return (long) getValue(pos); - } - - @Override - public float getFloat(int pos) { - return (float) getValue(pos); - } - - @Override - public double getDouble(int pos) { - return (double) getValue(pos); - } - - @Override - public StringData getString(int pos) { - return (StringData) getValue(pos); - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return (DecimalData) getValue(pos); - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - return (TimestampData) getValue(pos); - } - - @Override - @SuppressWarnings("unchecked") - public RawValueData getRawValue(int pos) { - return (RawValueData) getValue(pos); - } - - @Override - public byte[] getBinary(int pos) { - return (byte[]) getValue(pos); - } - - @Override - public ArrayData getArray(int pos) { - return (ArrayData) getValue(pos); - } - - @Override - public MapData getMap(int pos) { - return (MapData) getValue(pos); - } - - @Override - public RowData getRow(int pos, int numFields) { - return (RowData) getValue(pos); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (!(o instanceof RowDataProjection)) { - return false; - } - - RowDataProjection that = (RowDataProjection) o; - return deepEquals(that); - } - - @Override - public int hashCode() { - int result = Objects.hashCode(getRowKind()); - for (int pos = 0; pos < getArity(); pos++) { - if (!isNullAt(pos)) { - // Arrays.deepHashCode handles array object properly - result = 31 * result + Arrays.deepHashCode(new Object[] {getValue(pos)}); - } - } - - return result; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append(getRowKind().shortString()).append("("); - for (int pos = 0; pos < getArity(); pos++) { - if (pos != 0) { - sb.append(","); - } - // copied the behavior from Flink 
GenericRowData - sb.append(StringUtils.arrayAwareToString(getValue(pos))); - } - - sb.append(")"); - return sb.toString(); - } - - private boolean deepEquals(RowDataProjection other) { - if (getRowKind() != other.getRowKind()) { - return false; - } - - if (getArity() != other.getArity()) { - return false; - } - - for (int pos = 0; pos < getArity(); ++pos) { - if (isNullAt(pos) && other.isNullAt(pos)) { - continue; - } - - if ((isNullAt(pos) && !other.isNullAt(pos)) || (!isNullAt(pos) && other.isNullAt(pos))) { - return false; - } - - // Objects.deepEquals handles array object properly - if (!Objects.deepEquals(getValue(pos), other.getValue(pos))) { - return false; - } - } - - return true; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java deleted file mode 100644 index f23a7ee3d0d3..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
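As a rough usage sketch of the projection above (the two-column schema and values are illustrative only): build the projection from the full Flink RowType plus the Iceberg struct types, then wrap each incoming RowData.

    import org.apache.flink.table.data.GenericRowData;
    import org.apache.flink.table.data.RowData;
    import org.apache.flink.table.data.StringData;
    import org.apache.flink.table.types.logical.RowType;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.flink.FlinkSchemaUtil;
    import org.apache.iceberg.flink.data.RowDataProjection;
    import org.apache.iceberg.types.Types;

    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "name", Types.StringType.get()));
    Schema projected = schema.select("id");              // keep only the id column
    RowType rowType = FlinkSchemaUtil.convert(schema);   // Flink type of the full rows

    RowDataProjection projection =
        RowDataProjection.create(rowType, schema.asStruct(), projected.asStruct());
    RowData full = GenericRowData.of(42L, StringData.fromString("a"));
    RowData idOnly = projection.wrap(full);               // arity 1; getLong(0) == 42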
- */ -package org.apache.iceberg.flink.data; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.util.UUID; -import org.apache.avro.generic.GenericData; -import org.apache.avro.util.Utf8; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ByteBuffers; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.UUIDUtil; - -public class RowDataUtil { - - private RowDataUtil() {} - - public static Object convertConstant(Type type, Object value) { - if (value == null) { - return null; - } - - switch (type.typeId()) { - case DECIMAL: // DecimalData - Types.DecimalType decimal = (Types.DecimalType) type; - return DecimalData.fromBigDecimal((BigDecimal) value, decimal.precision(), decimal.scale()); - case STRING: // StringData - if (value instanceof Utf8) { - Utf8 utf8 = (Utf8) value; - return StringData.fromBytes(utf8.getBytes(), 0, utf8.getByteLength()); - } - return StringData.fromString(value.toString()); - case FIXED: // byte[] - if (value instanceof byte[]) { - return value; - } else if (value instanceof GenericData.Fixed) { - return ((GenericData.Fixed) value).bytes(); - } - return ByteBuffers.toByteArray((ByteBuffer) value); - case BINARY: // byte[] - return ByteBuffers.toByteArray((ByteBuffer) value); - case TIME: // int mills instead of long - return (int) ((Long) value / 1000); - case TIMESTAMP: // TimestampData - return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value)); - case UUID: - return UUIDUtil.convert((UUID) value); - default: - } - return value; - } - - /** - * Similar to the private {@link RowDataSerializer#copyRowData(RowData, RowData)} method. This - * skips the check the arity of rowType and from, because the from RowData may contains additional - * column for position deletes. Using {@link RowDataSerializer#copy(RowData, RowData)} will fail - * the arity check. - */ - public static RowData clone( - RowData from, - RowData reuse, - RowType rowType, - TypeSerializer[] fieldSerializers, - RowData.FieldGetter[] fieldGetters) { - GenericRowData ret; - if (reuse instanceof GenericRowData) { - ret = (GenericRowData) reuse; - } else { - ret = new GenericRowData(from.getArity()); - } - - ret.setRowKind(from.getRowKind()); - for (int i = 0; i < rowType.getFieldCount(); i++) { - if (!from.isNullAt(i)) { - ret.setField(i, fieldSerializers[i].copy(fieldGetters[i].getFieldOrNull(from))); - } else { - ret.setField(i, null); - } - } - - return ret; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java deleted file mode 100644 index 34576a1e5c0b..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.Duration; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.UUID; -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ByteBuffers; - -@Internal -public class StructRowData implements RowData { - private final Types.StructType type; - private RowKind kind; - private StructLike struct; - - public StructRowData(Types.StructType type) { - this(type, RowKind.INSERT); - } - - public StructRowData(Types.StructType type, RowKind kind) { - this(type, null, kind); - } - - private StructRowData(Types.StructType type, StructLike struct) { - this(type, struct, RowKind.INSERT); - } - - private StructRowData(Types.StructType type, StructLike struct, RowKind kind) { - this.type = type; - this.struct = struct; - this.kind = kind; - } - - public StructRowData setStruct(StructLike newStruct) { - this.struct = newStruct; - return this; - } - - @Override - public int getArity() { - return struct.size(); - } - - @Override - public RowKind getRowKind() { - return kind; - } - - @Override - public void setRowKind(RowKind newKind) { - Preconditions.checkNotNull(newKind, "kind can not be null"); - this.kind = newKind; - } - - @Override - public boolean isNullAt(int pos) { - return struct.get(pos, Object.class) == null; - } - - @Override - public boolean getBoolean(int pos) { - return struct.get(pos, Boolean.class); - } - - @Override - public byte getByte(int pos) { - return (byte) (int) struct.get(pos, Integer.class); - } - - @Override - public short getShort(int pos) { - return (short) (int) struct.get(pos, Integer.class); - } - - @Override - public int getInt(int pos) { - Object integer = struct.get(pos, Object.class); - - if (integer instanceof Integer) { - return (int) integer; - } else if (integer instanceof LocalDate) { - return (int) ((LocalDate) integer).toEpochDay(); - } else if (integer instanceof LocalTime) { - return (int) 
(((LocalTime) integer).toNanoOfDay() / 1000_000); - } else { - throw new IllegalStateException( - "Unknown type for int field. Type name: " + integer.getClass().getName()); - } - } - - @Override - public long getLong(int pos) { - Object longVal = struct.get(pos, Object.class); - - if (longVal instanceof Long) { - return (long) longVal; - } else if (longVal instanceof OffsetDateTime) { - return Duration.between(Instant.EPOCH, (OffsetDateTime) longVal).toNanos() / 1000; - } else if (longVal instanceof LocalDate) { - return ((LocalDate) longVal).toEpochDay(); - } else if (longVal instanceof LocalTime) { - return ((LocalTime) longVal).toNanoOfDay(); - } else if (longVal instanceof LocalDateTime) { - return Duration.between(Instant.EPOCH, ((LocalDateTime) longVal).atOffset(ZoneOffset.UTC)) - .toNanos() - / 1000; - } else { - throw new IllegalStateException( - "Unknown type for long field. Type name: " + longVal.getClass().getName()); - } - } - - @Override - public float getFloat(int pos) { - return struct.get(pos, Float.class); - } - - @Override - public double getDouble(int pos) { - return struct.get(pos, Double.class); - } - - @Override - public StringData getString(int pos) { - return isNullAt(pos) ? null : getStringDataInternal(pos); - } - - private StringData getStringDataInternal(int pos) { - CharSequence seq = struct.get(pos, CharSequence.class); - return StringData.fromString(seq.toString()); - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return isNullAt(pos) - ? null - : DecimalData.fromBigDecimal(getDecimalInternal(pos), precision, scale); - } - - private BigDecimal getDecimalInternal(int pos) { - return struct.get(pos, BigDecimal.class); - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - long timeLong = getLong(pos); - return TimestampData.fromEpochMillis(timeLong / 1000, (int) (timeLong % 1000) * 1000); - } - - @Override - public RawValueData getRawValue(int pos) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public byte[] getBinary(int pos) { - return isNullAt(pos) ? null : getBinaryInternal(pos); - } - - private byte[] getBinaryInternal(int pos) { - Object bytes = struct.get(pos, Object.class); - - // should only be either ByteBuffer or byte[] - if (bytes instanceof ByteBuffer) { - return ByteBuffers.toByteArray((ByteBuffer) bytes); - } else if (bytes instanceof byte[]) { - return (byte[]) bytes; - } else if (bytes instanceof UUID) { - UUID uuid = (UUID) bytes; - ByteBuffer bb = ByteBuffer.allocate(16); - bb.putLong(uuid.getMostSignificantBits()); - bb.putLong(uuid.getLeastSignificantBits()); - return bb.array(); - } else { - throw new IllegalStateException( - "Unknown type for binary field. Type name: " + bytes.getClass().getName()); - } - } - - @Override - public ArrayData getArray(int pos) { - return isNullAt(pos) - ? null - : (ArrayData) - convertValue(type.fields().get(pos).type().asListType(), struct.get(pos, List.class)); - } - - @Override - public MapData getMap(int pos) { - return isNullAt(pos) - ? null - : (MapData) - convertValue(type.fields().get(pos).type().asMapType(), struct.get(pos, Map.class)); - } - - @Override - public RowData getRow(int pos, int numFields) { - return isNullAt(pos) ? 
null : getStructRowData(pos); - } - - private StructRowData getStructRowData(int pos) { - return new StructRowData( - type.fields().get(pos).type().asStructType(), struct.get(pos, StructLike.class)); - } - - private Object convertValue(Type elementType, Object value) { - switch (elementType.typeId()) { - case BOOLEAN: - case INTEGER: - case DATE: - case TIME: - case LONG: - case FLOAT: - case DOUBLE: - case DECIMAL: - return value; - case TIMESTAMP: - long millisecond = (long) value / 1000; - int nanoOfMillisecond = (int) ((Long) value % 1000) * 1000; - return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); - case STRING: - return StringData.fromString(value.toString()); - case FIXED: - case BINARY: - return ByteBuffers.toByteArray((ByteBuffer) value); - case STRUCT: - return new StructRowData(elementType.asStructType(), (StructLike) value); - case LIST: - List list = (List) value; - Object[] array = new Object[list.size()]; - - int index = 0; - for (Object element : list) { - if (element == null) { - array[index] = null; - } else { - array[index] = convertValue(elementType.asListType().elementType(), element); - } - - index += 1; - } - return new GenericArrayData(array); - case MAP: - Types.MapType mapType = elementType.asMapType(); - Set> entries = ((Map) value).entrySet(); - Map result = Maps.newHashMap(); - for (Map.Entry entry : entries) { - final Object keyValue = convertValue(mapType.keyType(), entry.getKey()); - final Object valueValue = convertValue(mapType.valueType(), entry.getValue()); - result.put(keyValue, valueValue); - } - - return new GenericMapData(result); - default: - throw new UnsupportedOperationException("Unsupported element type: " + elementType); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java deleted file mode 100644 index 2fce5e0b3e80..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/DeleteOrphanFiles.java +++ /dev/null @@ -1,324 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
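A small illustrative sketch of the StructRowData wrapper above: it exposes an Iceberg StructLike (here a GenericRecord with made-up fields) through Flink's RowData interface.

    import org.apache.flink.table.data.RowData;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.data.GenericRecord;
    import org.apache.iceberg.flink.data.StructRowData;
    import org.apache.iceberg.types.Types;

    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "name", Types.StringType.get()));

    GenericRecord record = GenericRecord.create(schema);   // illustrative record
    record.setField("id", 42L);
    record.setField("name", "a");

    RowData row = new StructRowData(schema.asStruct()).setStruct(record);
    long id = row.getLong(0);                     // 42
    String name = row.getString(1).toString();    // "a"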
- */ -package org.apache.iceberg.flink.maintenance.api; - -import java.time.Duration; -import java.util.Map; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.flink.util.OutputTag; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.actions.DeleteOrphanFiles.PrefixMismatchMode; -import org.apache.iceberg.flink.maintenance.operator.DeleteFilesProcessor; -import org.apache.iceberg.flink.maintenance.operator.FileNameReader; -import org.apache.iceberg.flink.maintenance.operator.FileUriKeySelector; -import org.apache.iceberg.flink.maintenance.operator.ListFileSystemFiles; -import org.apache.iceberg.flink.maintenance.operator.ListMetadataFiles; -import org.apache.iceberg.flink.maintenance.operator.MetadataTablePlanner; -import org.apache.iceberg.flink.maintenance.operator.OrphanFilesDetector; -import org.apache.iceberg.flink.maintenance.operator.SkipOnError; -import org.apache.iceberg.flink.maintenance.operator.TaskResultAggregator; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.relocated.com.google.common.base.Splitter; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.ThreadPools; - -/** Delete orphan files from the file system. */ -public class DeleteOrphanFiles { - - private static final Schema FILE_PATH_SCHEMA = new Schema(DataFile.FILE_PATH); - private static final ScanContext FILE_PATH_SCAN_CONTEXT = - ScanContext.builder().streaming(true).project(FILE_PATH_SCHEMA).build(); - private static final Splitter COMMA_SPLITTER = Splitter.on(","); - - @Internal - public static final OutputTag ERROR_STREAM = - new OutputTag<>("error-stream", TypeInformation.of(Exception.class)); - - static final String PLANNER_TASK_NAME = "Table Planner"; - static final String READER_TASK_NAME = "Files Reader"; - static final String FILESYSTEM_FILES_TASK_NAME = "Filesystem Files"; - static final String METADATA_FILES_TASK_NAME = "List metadata Files"; - static final String DELETE_FILES_TASK_NAME = "Delete File"; - static final String AGGREGATOR_TASK_NAME = "Orphan Files Aggregator"; - static final String FILTER_FILES_TASK_NAME = "Filter File"; - static final String SKIP_ON_ERROR_TASK_NAME = "Skip On Error"; - - public static DeleteOrphanFiles.Builder builder() { - return new DeleteOrphanFiles.Builder(); - } - - private DeleteOrphanFiles() { - // Do not instantiate directly - } - - public static class Builder extends MaintenanceTaskBuilder { - private String location; - private Duration minAge = Duration.ofDays(3); - private int planningWorkerPoolSize = ThreadPools.WORKER_THREAD_POOL_SIZE; - private int deleteBatchSize = 1000; - private boolean usePrefixListing = false; - private Map equalSchemes = - Maps.newHashMap( - ImmutableMap.of( - "s3n", "s3", - "s3a", "s3")); - private final Map equalAuthorities = Maps.newHashMap(); - private PrefixMismatchMode prefixMismatchMode = PrefixMismatchMode.ERROR; - - @Override - String maintenanceTaskName() { - return "DeleteOrphanFiles"; - } - - /** - * The location to start the recursive listing the candidate files for removal. By default, the - * {@link Table#location()} is used. 
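As a rough configuration sketch (the bucket path, ages, and batch sizes below are made up), the builder in this class is typically chained like this before being handed to the maintenance stream:

    import java.time.Duration;
    import java.util.Map;
    import org.apache.iceberg.actions.DeleteOrphanFiles.PrefixMismatchMode;
    import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles;

    DeleteOrphanFiles.Builder deleteOrphanFiles = DeleteOrphanFiles.builder()
        .location("s3://bucket/warehouse/db/tbl")         // hypothetical root to scan
        .minAge(Duration.ofDays(7))                       // keep anything newer than 7 days
        .equalSchemes(Map.of("s3a,s3n", "s3"))            // treat s3a/s3n as s3
        .prefixMismatchMode(PrefixMismatchMode.IGNORE)
        .deleteBatchSize(500)
        .scheduleOnInterval(Duration.ofDays(1));          // trigger roughly once a day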
- * - * @param newLocation the task will scan - * @return for chained calls - */ - public Builder location(String newLocation) { - this.location = newLocation; - return this; - } - - /** - * Whether to use prefix listing when listing files from the file system. - * - * @param newUsePrefixListing true to enable prefix listing, false otherwise - * @return for chained calls - */ - public Builder usePrefixListing(boolean newUsePrefixListing) { - this.usePrefixListing = newUsePrefixListing; - return this; - } - - /** - * Action behavior when location prefixes (schemes/authorities) mismatch. - * - * @param newPrefixMismatchMode to action when mismatch - * @return for chained calls - */ - public Builder prefixMismatchMode(PrefixMismatchMode newPrefixMismatchMode) { - this.prefixMismatchMode = newPrefixMismatchMode; - return this; - } - - /** - * The files newer than this age will not be removed. - * - * @param newMinAge of the files to be removed - * @return for chained calls - */ - public Builder minAge(Duration newMinAge) { - this.minAge = newMinAge; - return this; - } - - /** - * The worker pool size used for planning the scan of the {@link MetadataTableType#ALL_FILES} - * table. This scan is used for determining the files used by the table. - * - * @param newPlanningWorkerPoolSize for scanning - * @return for chained calls - */ - public Builder planningWorkerPoolSize(int newPlanningWorkerPoolSize) { - this.planningWorkerPoolSize = newPlanningWorkerPoolSize; - return this; - } - - /** - * Passes schemes that should be considered equal. - * - *

    The key may include a comma-separated list of schemes. For instance, - * Map("s3a,s3,s3n","s3"). - * - * @param newEqualSchemes list of equal schemes - * @return this for method chaining - */ - public Builder equalSchemes(Map newEqualSchemes) { - equalSchemes.putAll(flattenMap(newEqualSchemes)); - return this; - } - - /** - * Passes authorities that should be considered equal. - * - *

    The key may include a comma-separate list of authorities. For instance, - * Map("s1name,s2name","servicename"). - * - * @param newEqualAuthorities list of equal authorities - * @return this for method chaining - */ - public Builder equalAuthorities(Map newEqualAuthorities) { - equalAuthorities.putAll(flattenMap(newEqualAuthorities)); - return this; - } - - /** - * Size of the batch used to deleting the files. - * - * @param newDeleteBatchSize number of batch file - * @return for chained calls - */ - public Builder deleteBatchSize(int newDeleteBatchSize) { - this.deleteBatchSize = newDeleteBatchSize; - return this; - } - - @Override - DataStream append(DataStream trigger) { - tableLoader().open(); - - // Collect all data files - SingleOutputStreamOperator splits = - trigger - .process( - new MetadataTablePlanner( - taskName(), - index(), - tableLoader(), - FILE_PATH_SCAN_CONTEXT, - MetadataTableType.ALL_FILES, - planningWorkerPoolSize)) - .name(operatorName(PLANNER_TASK_NAME)) - .uid(PLANNER_TASK_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .forceNonParallel(); - - // Read the records and get all data files - SingleOutputStreamOperator tableDataFiles = - splits - .rebalance() - .process( - new FileNameReader( - taskName(), - index(), - tableLoader(), - FILE_PATH_SCHEMA, - FILE_PATH_SCAN_CONTEXT, - MetadataTableType.ALL_FILES)) - .name(operatorName(READER_TASK_NAME)) - .uid(READER_TASK_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .setParallelism(parallelism()); - - // Collect all meta data files - SingleOutputStreamOperator tableMetadataFiles = - trigger - .process(new ListMetadataFiles(taskName(), index(), tableLoader())) - .name(operatorName(METADATA_FILES_TASK_NAME)) - .uid(METADATA_FILES_TASK_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .forceNonParallel(); - - // List the all file system files - SingleOutputStreamOperator allFsFiles = - trigger - .process( - new ListFileSystemFiles( - taskName(), - index(), - tableLoader(), - location, - minAge.toMillis(), - usePrefixListing)) - .name(operatorName(FILESYSTEM_FILES_TASK_NAME)) - .uid(FILESYSTEM_FILES_TASK_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .forceNonParallel(); - - SingleOutputStreamOperator filesToDelete = - tableMetadataFiles - .union(tableDataFiles) - .keyBy(new FileUriKeySelector(equalSchemes, equalAuthorities)) - .connect(allFsFiles.keyBy(new FileUriKeySelector(equalSchemes, equalAuthorities))) - .process(new OrphanFilesDetector(prefixMismatchMode, equalSchemes, equalAuthorities)) - .slotSharingGroup(slotSharingGroup()) - .name(operatorName(FILTER_FILES_TASK_NAME)) - .uid(FILTER_FILES_TASK_NAME + uidSuffix()) - .setParallelism(parallelism()); - - DataStream errorStream = - tableMetadataFiles - .getSideOutput(ERROR_STREAM) - .union( - allFsFiles.getSideOutput(ERROR_STREAM), - tableDataFiles.getSideOutput(ERROR_STREAM), - splits.getSideOutput(ERROR_STREAM), - filesToDelete.getSideOutput(ERROR_STREAM)); - - // Stop deleting the files if there is an error - SingleOutputStreamOperator filesOrSkip = - filesToDelete - .connect(errorStream) - .transform( - operatorName(SKIP_ON_ERROR_TASK_NAME), - TypeInformation.of(String.class), - new SkipOnError()) - .uid(SKIP_ON_ERROR_TASK_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .forceNonParallel(); - - // delete the files - filesOrSkip - .rebalance() - .transform( - operatorName(DELETE_FILES_TASK_NAME), - TypeInformation.of(Void.class), - new DeleteFilesProcessor( - tableLoader().loadTable(), 
taskName(), index(), deleteBatchSize)) - .uid(DELETE_FILES_TASK_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .setParallelism(parallelism()); - - // Ignore the file deletion result and return the DataStream directly - return trigger - .connect(errorStream) - .transform( - operatorName(AGGREGATOR_TASK_NAME), - TypeInformation.of(TaskResult.class), - new TaskResultAggregator(tableName(), taskName(), index())) - .uid(AGGREGATOR_TASK_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .forceNonParallel(); - } - } - - private static Map flattenMap(Map map) { - Map flattenedMap = Maps.newHashMap(); - if (map != null) { - for (String key : map.keySet()) { - String value = map.get(key); - for (String splitKey : COMMA_SPLITTER.split(key)) { - flattenedMap.put(splitKey.trim(), value.trim()); - } - } - } - - return flattenedMap; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java deleted file mode 100644 index 628a91141474..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ExpireSnapshots.java +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.api; - -import java.time.Duration; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.flink.maintenance.operator.DeleteFilesProcessor; -import org.apache.iceberg.flink.maintenance.operator.ExpireSnapshotsProcessor; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** Deletes expired snapshots and the corresponding files. */ -public class ExpireSnapshots { - private static final int DELETE_BATCH_SIZE_DEFAULT = 1000; - private static final String EXECUTOR_OPERATOR_NAME = "Expire Snapshot"; - @VisibleForTesting static final String DELETE_FILES_OPERATOR_NAME = "Delete file"; - - private ExpireSnapshots() {} - - /** Creates the builder for creating a stream which expires snapshots for the table. 
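A minimal wiring sketch, assuming the TableMaintenance and TableLoader entry points from this module behave as in the Iceberg Flink maintenance documentation; the environment, table location, JDBC URI, and retention numbers are placeholders:

    import java.time.Duration;
    import java.util.Map;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.iceberg.flink.TableLoader;
    import org.apache.iceberg.flink.maintenance.api.ExpireSnapshots;
    import org.apache.iceberg.flink.maintenance.api.JdbcLockFactory;
    import org.apache.iceberg.flink.maintenance.api.TableMaintenance;

    void buildMaintenance() throws Exception {
      StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
      TableLoader tableLoader =
          TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");                 // placeholder
      JdbcLockFactory lockFactory =
          new JdbcLockFactory("jdbc:postgresql://db:5432/iceberg", "db.tbl", Map.of());   // placeholder

      TableMaintenance.forTable(env, tableLoader, lockFactory)
          .uidSuffix("tbl-maintenance")
          .add(ExpireSnapshots.builder()
              .maxSnapshotAge(Duration.ofDays(7))
              .retainLast(10)
              .deleteBatchSize(1000)
              .scheduleOnCommitCount(10))
          .append();

      env.execute("Table maintenance");   // the maintenance stream runs as part of this job
    }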
*/ - public static Builder builder() { - return new Builder(); - } - - public static class Builder extends MaintenanceTaskBuilder { - private Duration maxSnapshotAge = null; - private Integer numSnapshots = null; - private Integer planningWorkerPoolSize; - private int deleteBatchSize = DELETE_BATCH_SIZE_DEFAULT; - private Boolean cleanExpiredMetadata = null; - - @Override - String maintenanceTaskName() { - return "ExpireSnapshots"; - } - - /** - * The snapshots older than this age will be removed. - * - * @param newMaxSnapshotAge of the snapshots to be removed - */ - public Builder maxSnapshotAge(Duration newMaxSnapshotAge) { - this.maxSnapshotAge = newMaxSnapshotAge; - return this; - } - - /** - * The minimum number of {@link Snapshot}s to retain. For more details description see {@link - * org.apache.iceberg.ExpireSnapshots#retainLast(int)}. - * - * @param newNumSnapshots number of snapshots to retain - */ - public Builder retainLast(int newNumSnapshots) { - this.numSnapshots = newNumSnapshots; - return this; - } - - /** - * The worker pool size used to calculate the files to delete. If not set, the shared worker - * pool is used. - * - * @param newPlanningWorkerPoolSize for planning files to delete - */ - public Builder planningWorkerPoolSize(int newPlanningWorkerPoolSize) { - this.planningWorkerPoolSize = newPlanningWorkerPoolSize; - return this; - } - - /** - * Size of the batch used to deleting the files. - * - * @param newDeleteBatchSize used for deleting - */ - public Builder deleteBatchSize(int newDeleteBatchSize) { - this.deleteBatchSize = newDeleteBatchSize; - return this; - } - - /** - * Expires unused table metadata such as partition specs and schemas. - * - * @param newCleanExpiredMetadata remove unused partition specs, schemas, or other metadata when - * true - * @return this for method chaining - */ - public Builder cleanExpiredMetadata(boolean newCleanExpiredMetadata) { - this.cleanExpiredMetadata = newCleanExpiredMetadata; - return this; - } - - @Override - DataStream append(DataStream trigger) { - Preconditions.checkNotNull(tableLoader(), "TableLoader should not be null"); - - SingleOutputStreamOperator result = - trigger - .process( - new ExpireSnapshotsProcessor( - tableLoader(), - maxSnapshotAge == null ? null : maxSnapshotAge.toMillis(), - numSnapshots, - planningWorkerPoolSize, - cleanExpiredMetadata)) - .name(operatorName(EXECUTOR_OPERATOR_NAME)) - .uid(EXECUTOR_OPERATOR_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .forceNonParallel(); - - result - .getSideOutput(ExpireSnapshotsProcessor.DELETE_STREAM) - .rebalance() - .transform( - operatorName(DELETE_FILES_OPERATOR_NAME), - TypeInformation.of(Void.class), - new DeleteFilesProcessor( - tableLoader().loadTable(), taskName(), index(), deleteBatchSize)) - .uid(DELETE_FILES_OPERATOR_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .setParallelism(parallelism()); - - // Ignore the file deletion result and return the DataStream directly - return result; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java deleted file mode 100644 index 0c88abf82099..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/FlinkMaintenanceConfig.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.api; - -import java.util.Map; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.graph.StreamGraphGenerator; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkConfParser; - -public class FlinkMaintenanceConfig { - - public static final String PREFIX = "flink-maintenance."; - - public static final String LOCK_CHECK_DELAY = PREFIX + "lock-check-delay-seconds"; - public static final ConfigOption LOCK_CHECK_DELAY_OPTION = - ConfigOptions.key(LOCK_CHECK_DELAY) - .longType() - .defaultValue(TableMaintenance.LOCK_CHECK_DELAY_SECOND_DEFAULT) - .withDescription( - "The delay time (in seconds) between each lock check during maintenance operations such as " - + "rewriting data files, manifest files, expiring snapshots, and deleting orphan files."); - - public static final String PARALLELISM = PREFIX + "parallelism"; - public static final ConfigOption PARALLELISM_OPTION = - ConfigOptions.key(PARALLELISM) - .intType() - .defaultValue(ExecutionConfig.PARALLELISM_DEFAULT) - .withDescription("The number of parallel tasks for the maintenance action."); - - public static final String RATE_LIMIT = PREFIX + "rate-limit-seconds"; - public static final ConfigOption RATE_LIMIT_OPTION = - ConfigOptions.key(RATE_LIMIT) - .longType() - .defaultValue(TableMaintenance.RATE_LIMIT_SECOND_DEFAULT) - .withDescription( - "The rate limit (in seconds) for maintenance operations. " - + "This controls how many operations can be performed per second."); - - public static final String SLOT_SHARING_GROUP = PREFIX + "slot-sharing-group"; - public static final ConfigOption SLOT_SHARING_GROUP_OPTION = - ConfigOptions.key(SLOT_SHARING_GROUP) - .stringType() - .defaultValue(StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP) - .withDescription( - "The slot sharing group for maintenance tasks. " - + "Determines which operators can share slots in the Flink execution environment."); - - private final FlinkConfParser confParser; - private final Table table; - private final Map writeProperties; - private final ReadableConfig readableConfig; - - public FlinkMaintenanceConfig( - Table table, Map writeOptions, ReadableConfig readableConfig) { - this.table = table; - this.readableConfig = readableConfig; - this.writeProperties = writeOptions; - this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); - } - - /** Gets the rate limit value (in seconds) for maintenance operations. 
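The option keys above are plain Flink configuration entries; a sketch of setting them on a Configuration (the values are arbitrary):

    import org.apache.flink.configuration.Configuration;
    import org.apache.iceberg.flink.maintenance.api.FlinkMaintenanceConfig;

    Configuration conf = new Configuration();
    conf.set(FlinkMaintenanceConfig.RATE_LIMIT_OPTION, 120L);        // rate limit, in seconds
    conf.set(FlinkMaintenanceConfig.LOCK_CHECK_DELAY_OPTION, 30L);   // lock check delay, in seconds
    conf.set(FlinkMaintenanceConfig.PARALLELISM_OPTION, 4);
    conf.set(FlinkMaintenanceConfig.SLOT_SHARING_GROUP_OPTION, "maintenance");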
*/ - public long rateLimit() { - return confParser - .longConf() - .option(RATE_LIMIT) - .flinkConfig(RATE_LIMIT_OPTION) - .defaultValue(RATE_LIMIT_OPTION.defaultValue()) - .parse(); - } - - /** Gets the parallelism value for maintenance tasks. */ - public int parallelism() { - return confParser - .intConf() - .option(PARALLELISM) - .flinkConfig(PARALLELISM_OPTION) - .defaultValue(PARALLELISM_OPTION.defaultValue()) - .parse(); - } - - /** Gets the lock check delay value (in seconds). */ - public long lockCheckDelay() { - return confParser - .longConf() - .option(LOCK_CHECK_DELAY) - .flinkConfig(LOCK_CHECK_DELAY_OPTION) - .defaultValue(LOCK_CHECK_DELAY_OPTION.defaultValue()) - .parse(); - } - - /** Gets the slot sharing group value for maintenance tasks. */ - public String slotSharingGroup() { - return confParser - .stringConf() - .option(SLOT_SHARING_GROUP) - .flinkConfig(SLOT_SHARING_GROUP_OPTION) - .defaultValue(SLOT_SHARING_GROUP_OPTION.defaultValue()) - .parse(); - } - - public RewriteDataFilesConfig createRewriteDataFilesConfig() { - return new RewriteDataFilesConfig(table, writeProperties, readableConfig); - } - - public LockConfig createLockConfig() { - return new LockConfig(table, writeProperties, readableConfig); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java deleted file mode 100644 index f68605accc57..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java +++ /dev/null @@ -1,325 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.api; - -import java.io.IOException; -import java.sql.DatabaseMetaData; -import java.sql.PreparedStatement; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.sql.SQLNonTransientConnectionException; -import java.sql.SQLTimeoutException; -import java.sql.SQLTransientConnectionException; -import java.util.Map; -import java.util.UUID; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.jdbc.JdbcClientPool; -import org.apache.iceberg.jdbc.UncheckedInterruptedException; -import org.apache.iceberg.jdbc.UncheckedSQLException; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.PropertyUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** JDBC table backed implementation of the {@link TriggerLockFactory}. 
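A construction sketch for the factory described above (the connection string and lock id are placeholders); setting the init property lets the factory create its lock table when open() is called:

    import java.util.Map;
    import org.apache.iceberg.flink.maintenance.api.JdbcLockFactory;

    Map<String, String> props =
        Map.of(JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY, "true");   // create the lock table if missing
    JdbcLockFactory lockFactory =
        new JdbcLockFactory("jdbc:postgresql://db:5432/iceberg", "catalog.db.tbl", props);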
*/ -public class JdbcLockFactory implements TriggerLockFactory { - private static final Logger LOG = LoggerFactory.getLogger(JdbcLockFactory.class); - - @Internal - public static final String INIT_LOCK_TABLES_PROPERTY = - "flink-maintenance.lock.jdbc.init-lock-tables"; - - private static final String LOCK_TABLE_NAME = "flink_maintenance_lock"; - private static final int LOCK_ID_MAX_LENGTH = 100; - private static final String CREATE_LOCK_TABLE_SQL = - String.format( - "CREATE TABLE %s " - + "(LOCK_TYPE CHAR(1) NOT NULL, " - + "LOCK_ID VARCHAR(%s) NOT NULL, " - + "INSTANCE_ID CHAR(36) NOT NULL, PRIMARY KEY (LOCK_TYPE, LOCK_ID))", - LOCK_TABLE_NAME, LOCK_ID_MAX_LENGTH); - - private static final String CREATE_LOCK_SQL = - String.format( - "INSERT INTO %s (LOCK_TYPE, LOCK_ID, INSTANCE_ID) VALUES (?, ?, ?)", LOCK_TABLE_NAME); - private static final String GET_LOCK_SQL = - String.format("SELECT INSTANCE_ID FROM %s WHERE LOCK_TYPE=? AND LOCK_ID=?", LOCK_TABLE_NAME); - private static final String DELETE_LOCK_SQL = - String.format( - "DELETE FROM %s WHERE LOCK_TYPE=? AND LOCK_ID=? AND INSTANCE_ID=?", LOCK_TABLE_NAME); - - private final String uri; - private final String lockId; - private final Map properties; - private transient JdbcClientPool pool; - - /** - * Creates a new {@link TriggerLockFactory}. The lockId should be unique between the users of the - * same uri. - * - * @param uri of the jdbc connection - * @param lockId which should identify the job and the table - * @param properties used for creating the jdbc connection pool - */ - public JdbcLockFactory(String uri, String lockId, Map properties) { - Preconditions.checkNotNull(uri, "JDBC connection URI is required"); - Preconditions.checkNotNull(properties, "Properties map is required"); - Preconditions.checkArgument( - lockId.length() < LOCK_ID_MAX_LENGTH, - "Invalid prefix length: lockId should be shorter than %s", - LOCK_ID_MAX_LENGTH); - this.uri = uri; - this.lockId = lockId; - this.properties = properties; - } - - @Override - public void open() { - this.pool = new JdbcClientPool(1, uri, properties); - - if (PropertyUtil.propertyAsBoolean(properties, INIT_LOCK_TABLES_PROPERTY, false)) { - initializeLockTables(); - } - } - - /** Only used in testing to share the jdbc pool */ - @VisibleForTesting - void open(JdbcLockFactory other) { - this.pool = other.pool; - } - - @Override - public Lock createLock() { - return new JdbcLock(pool, lockId, Type.MAINTENANCE); - } - - @Override - public Lock createRecoveryLock() { - return new JdbcLock(pool, lockId, Type.RECOVERY); - } - - @Override - public void close() throws IOException { - pool.close(); - } - - private void initializeLockTables() { - LOG.debug("Creating database tables (if missing) to store table maintenance locks"); - try { - pool.run( - conn -> { - DatabaseMetaData dbMeta = conn.getMetaData(); - try (ResultSet rs = - dbMeta.getTables( - null /* catalog name */, - null /* schemaPattern */, - LOCK_TABLE_NAME /* tableNamePattern */, - null /* types */)) { - if (rs.next()) { - LOG.debug("Flink maintenance lock table already exists"); - return true; - } - } - LOG.info("Creating Flink maintenance lock table {}", LOCK_TABLE_NAME); - try (PreparedStatement ps = conn.prepareStatement(CREATE_LOCK_TABLE_SQL)) { - ps.execute(); - } - - return true; - }); - } catch (SQLTimeoutException e) { - throw new UncheckedSQLException( - e, "Cannot initialize JDBC table maintenance lock: Query timed out"); - } catch (SQLTransientConnectionException | SQLNonTransientConnectionException e) { - throw new 
UncheckedSQLException( - e, "Cannot initialize JDBC table maintenance lock: Connection failed"); - } catch (SQLException e) { - throw new UncheckedSQLException(e, "Cannot initialize JDBC table maintenance lock"); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new UncheckedInterruptedException(e, "Interrupted in call to initialize"); - } - } - - private static class JdbcLock implements TriggerLockFactory.Lock { - private final JdbcClientPool pool; - private final String lockId; - private final Type type; - - private JdbcLock(JdbcClientPool pool, String lockId, Type type) { - this.pool = pool; - this.lockId = lockId; - this.type = type; - } - - @Override - public boolean tryLock() { - if (isHeld()) { - LOG.info("Lock is already held for {}", this); - return false; - } - - String newInstanceId = UUID.randomUUID().toString(); - try { - return pool.run( - conn -> { - try (PreparedStatement sql = conn.prepareStatement(CREATE_LOCK_SQL)) { - sql.setString(1, type.key); - sql.setString(2, lockId); - sql.setString(3, newInstanceId); - int count = sql.executeUpdate(); - LOG.info( - "Created {} lock with instanceId {} with row count {}", - this, - newInstanceId, - count); - return count == 1; - } - }); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new UncheckedInterruptedException(e, "Interrupted during tryLock"); - } catch (SQLException e) { - // SQL exception happened when creating the lock. Check if the lock creation was - // successful behind the scenes. - if (newInstanceId.equals(instanceId())) { - return true; - } else { - throw new UncheckedSQLException(e, "Failed to create %s lock", this); - } - } - } - - @SuppressWarnings("checkstyle:NestedTryDepth") - @Override - public boolean isHeld() { - try { - return pool.run( - conn -> { - try (PreparedStatement sql = conn.prepareStatement(GET_LOCK_SQL)) { - sql.setString(1, type.key); - sql.setString(2, lockId); - try (ResultSet rs = sql.executeQuery()) { - return rs.next(); - } - } - }); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new UncheckedInterruptedException(e, "Interrupted during isHeld"); - } catch (SQLException e) { - // SQL exception happened when getting lock information - throw new UncheckedSQLException(e, "Failed to check the state of the lock %s", this); - } - } - - @SuppressWarnings("checkstyle:NestedTryDepth") - @Override - public void unlock() { - try { - // Possible concurrency issue: - // - `unlock` and `tryLock` happens at the same time when there is an existing lock - // - // Steps: - // 1. `unlock` removes the lock in the database, but there is a temporary connection failure - // 2. `lock` finds that there is no lock, so creates a new lock - // 3. `unlock` retries the lock removal and removes the new lock - // - // To prevent the situation above we fetch the current lockId, and remove the lock - // only with the given id. 
- String instanceId = instanceId(); - - if (instanceId != null) { - pool.run( - conn -> { - try (PreparedStatement sql = conn.prepareStatement(DELETE_LOCK_SQL)) { - sql.setString(1, type.key); - sql.setString(2, lockId); - sql.setString(3, instanceId); - long count = sql.executeUpdate(); - LOG.info( - "Deleted {} lock with instanceId {} with row count {}", - this, - instanceId, - count); - } catch (SQLException e) { - // SQL exception happened when deleting lock information - throw new UncheckedSQLException( - e, "Failed to delete %s lock with instanceId %s", this, instanceId); - } - - return null; - }); - } - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new UncheckedInterruptedException(e, "Interrupted during unlock"); - } catch (SQLException e) { - // SQL exception happened when getting/updating lock information - throw new UncheckedSQLException(e, "Failed to remove lock %s", this); - } - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("type", type).add("lockId", lockId).toString(); - } - - @SuppressWarnings("checkstyle:NestedTryDepth") - private String instanceId() { - try { - return pool.run( - conn -> { - try (PreparedStatement sql = conn.prepareStatement(GET_LOCK_SQL)) { - sql.setString(1, type.key); - sql.setString(2, lockId); - try (ResultSet rs = sql.executeQuery()) { - if (rs.next()) { - return rs.getString(1); - } else { - return null; - } - } - } catch (SQLException e) { - // SQL exception happened when getting lock information - throw new UncheckedSQLException(e, "Failed to get lock information for %s", type); - } - }); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new UncheckedInterruptedException(e, "Interrupted during unlock"); - } catch (SQLException e) { - throw new UncheckedSQLException(e, "Failed to get lock information for %s", type); - } - } - } - - private enum Type { - MAINTENANCE("m"), - RECOVERY("r"); - - private final String key; - - Type(String key) { - this.key = key; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java deleted file mode 100644 index b28731f91c15..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/LockConfig.java +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.api; - -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.commons.lang3.StringUtils; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkConfParser; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -public class LockConfig { - - public static final String PREFIX = FlinkMaintenanceConfig.PREFIX + "lock."; - - public static final ConfigOption LOCK_TYPE_OPTION = - ConfigOptions.key(PREFIX + "type") - .stringType() - .defaultValue(StringUtils.EMPTY) - .withDescription("The type of lock to use, e.g., jdbc or zookeeper."); - - public static final ConfigOption LOCK_ID_OPTION = - ConfigOptions.key(PREFIX + "lock-id") - .stringType() - .defaultValue(StringUtils.EMPTY) - .withDescription("The unique identifier for the lock."); - - public static class JdbcLockConfig { - - public static final String JDBC = "jdbc"; - - public static final ConfigOption JDBC_URI_OPTION = - ConfigOptions.key(PREFIX + JDBC + ".uri") - .stringType() - .defaultValue(StringUtils.EMPTY) - .withDescription("The URI of the JDBC connection for acquiring the lock."); - - public static final ConfigOption JDBC_INIT_LOCK_TABLE_OPTION = - ConfigOptions.key(PREFIX + JDBC + ".init-lock-table") - .stringType() - .defaultValue(Boolean.FALSE.toString()) - .withDescription("Whether to initialize the lock table in the JDBC database."); - } - - public static class ZkLockConfig { - public static final String ZK = "zookeeper"; - - public static final ConfigOption ZK_URI_OPTION = - ConfigOptions.key(PREFIX + ZK + ".uri") - .stringType() - .defaultValue(StringUtils.EMPTY) - .withDescription("The URI of the Zookeeper service for acquiring the lock."); - - public static final ConfigOption ZK_SESSION_TIMEOUT_MS_OPTION = - ConfigOptions.key(PREFIX + ZK + ".session-timeout-ms") - .intType() - .defaultValue(60000) - .withDescription("The session timeout (in milliseconds) for the Zookeeper client."); - - public static final ConfigOption ZK_CONNECTION_TIMEOUT_MS_OPTION = - ConfigOptions.key(PREFIX + ZK + ".connection-timeout-ms") - .intType() - .defaultValue(15000) - .withDescription("The connection timeout (in milliseconds) for the Zookeeper client."); - - public static final ConfigOption ZK_BASE_SLEEP_MS_OPTION = - ConfigOptions.key(PREFIX + ZK + ".base-sleep-ms") - .intType() - .defaultValue(3000) - .withDescription( - "The base sleep time (in milliseconds) between retries for the Zookeeper client."); - - public static final ConfigOption ZK_MAX_RETRIES_OPTION = - ConfigOptions.key(PREFIX + ZK + ".max-retries") - .intType() - .defaultValue(3) - .withDescription("The maximum number of retries for the Zookeeper client."); - } - - private final FlinkConfParser confParser; - private final Map writeProperties; - private final Map setProperties; - - public LockConfig(Table table, Map writeOptions, ReadableConfig readableConfig) { - this.writeProperties = writeOptions; - this.setProperties = readableConfig.toMap(); - this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); - } - - /** Gets the lock type configuration value (e.g., jdbc or zookeeper). 
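The lock options above can be supplied through a Flink Configuration (or the corresponding table write options); a JDBC-flavoured sketch with placeholder values:

    import org.apache.flink.configuration.Configuration;
    import org.apache.iceberg.flink.maintenance.api.LockConfig;

    Configuration conf = new Configuration();
    conf.set(LockConfig.LOCK_TYPE_OPTION, LockConfig.JdbcLockConfig.JDBC);   // "jdbc"
    conf.set(LockConfig.LOCK_ID_OPTION, "catalog.db.tbl");
    conf.set(LockConfig.JdbcLockConfig.JDBC_URI_OPTION, "jdbc:postgresql://db:5432/iceberg");
    conf.set(LockConfig.JdbcLockConfig.JDBC_INIT_LOCK_TABLE_OPTION, "true");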
*/ - public String lockType() { - return confParser - .stringConf() - .option(LOCK_TYPE_OPTION.key()) - .flinkConfig(LOCK_TYPE_OPTION) - .defaultValue(LOCK_TYPE_OPTION.defaultValue()) - .parse(); - } - - /** Gets the lock ID configuration value. If blank, returns the provided default value. */ - public String lockId(String defaultValue) { - String lockId = - confParser - .stringConf() - .option(LOCK_ID_OPTION.key()) - .flinkConfig(LOCK_ID_OPTION) - .defaultValue(LOCK_ID_OPTION.defaultValue()) - .parse(); - if (StringUtils.isBlank(lockId)) { - return defaultValue; - } - - return lockId; - } - - /** Gets the JDBC URI configuration value. */ - public String jdbcUri() { - return confParser - .stringConf() - .option(JdbcLockConfig.JDBC_URI_OPTION.key()) - .flinkConfig(JdbcLockConfig.JDBC_URI_OPTION) - .defaultValue(JdbcLockConfig.JDBC_URI_OPTION.defaultValue()) - .parse(); - } - - /** Gets the configuration value for initializing the JDBC lock table. */ - public String jdbcInitTable() { - return confParser - .stringConf() - .option(JdbcLockConfig.JDBC_INIT_LOCK_TABLE_OPTION.key()) - .flinkConfig(JdbcLockConfig.JDBC_INIT_LOCK_TABLE_OPTION) - .defaultValue(JdbcLockConfig.JDBC_INIT_LOCK_TABLE_OPTION.defaultValue()) - .parse(); - } - - /** Gets the Zookeeper URI configuration value. */ - public String zkUri() { - return confParser - .stringConf() - .option(ZkLockConfig.ZK_URI_OPTION.key()) - .flinkConfig(ZkLockConfig.ZK_URI_OPTION) - .defaultValue(ZkLockConfig.ZK_URI_OPTION.defaultValue()) - .parse(); - } - - /** Gets the Zookeeper session timeout configuration (in milliseconds). */ - public int zkSessionTimeoutMs() { - return confParser - .intConf() - .option(ZkLockConfig.ZK_SESSION_TIMEOUT_MS_OPTION.key()) - .flinkConfig(ZkLockConfig.ZK_SESSION_TIMEOUT_MS_OPTION) - .defaultValue(ZkLockConfig.ZK_SESSION_TIMEOUT_MS_OPTION.defaultValue()) - .parse(); - } - - /** Gets the Zookeeper connection timeout configuration (in milliseconds). */ - public int zkConnectionTimeoutMs() { - return confParser - .intConf() - .option(ZkLockConfig.ZK_CONNECTION_TIMEOUT_MS_OPTION.key()) - .flinkConfig(ZkLockConfig.ZK_CONNECTION_TIMEOUT_MS_OPTION) - .defaultValue(ZkLockConfig.ZK_CONNECTION_TIMEOUT_MS_OPTION.defaultValue()) - .parse(); - } - - /** Gets the Zookeeper base sleep time configuration (in milliseconds). */ - public int zkBaseSleepMs() { - return confParser - .intConf() - .option(ZkLockConfig.ZK_BASE_SLEEP_MS_OPTION.key()) - .flinkConfig(ZkLockConfig.ZK_BASE_SLEEP_MS_OPTION) - .defaultValue(ZkLockConfig.ZK_BASE_SLEEP_MS_OPTION.defaultValue()) - .parse(); - } - - /** Gets the Zookeeper maximum retry count configuration. 
*/ - public int zkMaxRetries() { - return confParser - .intConf() - .option(ZkLockConfig.ZK_MAX_RETRIES_OPTION.key()) - .flinkConfig(ZkLockConfig.ZK_MAX_RETRIES_OPTION) - .defaultValue(ZkLockConfig.ZK_MAX_RETRIES_OPTION.defaultValue()) - .parse(); - } - - public Map properties() { - Map mergeConfig = Maps.newHashMap(); - mergeConfig.putAll(setProperties); - mergeConfig.putAll(writeProperties); - return mergeConfig.entrySet().stream() - .filter(entry -> entry.getKey().startsWith(PREFIX)) - .collect( - Collectors.toMap( - entry -> entry.getKey().substring(PREFIX.length()), - Map.Entry::getValue, - (existing, replacement) -> existing, - Maps::newHashMap)); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java deleted file mode 100644 index 5d5f17b0a80e..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskBuilder.java +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.api; - -import java.time.Duration; -import org.apache.flink.annotation.Experimental; -import org.apache.flink.api.common.operators.util.OperatorValidationUtils; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.operator.TriggerEvaluator; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -@Experimental -@SuppressWarnings("unchecked") -public abstract class MaintenanceTaskBuilder> { - private int index; - private String taskName; - private String tableName; - private TableLoader tableLoader; - private String uidSuffix = null; - private String slotSharingGroup = null; - private Integer parallelism = null; - private final TriggerEvaluator.Builder triggerEvaluator = new TriggerEvaluator.Builder(); - - abstract DataStream append(DataStream sourceStream); - - abstract String maintenanceTaskName(); - - /** - * After a given number of Iceberg table commits since the last run, starts the downstream job. - * - * @param commitCount after the downstream job should be started - */ - public T scheduleOnCommitCount(int commitCount) { - triggerEvaluator.commitCount(commitCount); - return (T) this; - } - - /** - * After a given number of new data files since the last run, starts the downstream job. 
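The scheduling hooks above combine on a single builder; an illustrative sketch using the ExpireSnapshots builder from this package, which (as configured here) starts the task once any of the thresholds is reached:

    import java.time.Duration;
    import org.apache.iceberg.flink.maintenance.api.ExpireSnapshots;

    ExpireSnapshots.Builder expire = ExpireSnapshots.builder()
        .scheduleOnCommitCount(10)                  // 10 table commits since the last run...
        .scheduleOnDataFileCount(1_000)             // ...or 1000 new data files...
        .scheduleOnInterval(Duration.ofHours(6));   // ...or 6 hours at the latest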
- * - * @param dataFileCount after the downstream job should be started - */ - public T scheduleOnDataFileCount(int dataFileCount) { - triggerEvaluator.dataFileCount(dataFileCount); - return (T) this; - } - - /** - * After a given aggregated data file size since the last run, starts the downstream job. - * - * @param dataFileSizeInBytes after the downstream job should be started - */ - public T scheduleOnDataFileSize(long dataFileSizeInBytes) { - triggerEvaluator.dataFileSizeInBytes(dataFileSizeInBytes); - return (T) this; - } - - /** - * After a given number of new positional delete files since the last run, starts the downstream - * job. - * - * @param posDeleteFileCount after the downstream job should be started - */ - public T scheduleOnPosDeleteFileCount(int posDeleteFileCount) { - triggerEvaluator.posDeleteFileCount(posDeleteFileCount); - return (T) this; - } - - /** - * After a given number of new positional delete records since the last run, starts the downstream - * job. - * - * @param posDeleteRecordCount after the downstream job should be started - */ - public T scheduleOnPosDeleteRecordCount(long posDeleteRecordCount) { - triggerEvaluator.posDeleteRecordCount(posDeleteRecordCount); - return (T) this; - } - - /** - * After a given number of new equality delete files since the last run, starts the downstream - * job. - * - * @param eqDeleteFileCount after the downstream job should be started - */ - public T scheduleOnEqDeleteFileCount(int eqDeleteFileCount) { - triggerEvaluator.eqDeleteFileCount(eqDeleteFileCount); - return (T) this; - } - - /** - * After a given number of new equality delete records since the last run, starts the downstream - * job. - * - * @param eqDeleteRecordCount after the downstream job should be started - */ - public T scheduleOnEqDeleteRecordCount(long eqDeleteRecordCount) { - triggerEvaluator.eqDeleteRecordCount(eqDeleteRecordCount); - return (T) this; - } - - /** - * After a given time since the last run, starts the downstream job. - * - * @param interval after the downstream job should be started - */ - public T scheduleOnInterval(Duration interval) { - triggerEvaluator.timeout(interval); - return (T) this; - } - - /** - * The suffix used for the generated {@link org.apache.flink.api.dag.Transformation}'s uid. - * - * @param newUidSuffix for the transformations - */ - public T uidSuffix(String newUidSuffix) { - this.uidSuffix = newUidSuffix; - return (T) this; - } - - /** - * The {@link SingleOutputStreamOperator#slotSharingGroup(String)} for all the operators of the - * generated stream. Could be used to separate the resources used by this task. - * - * @param newSlotSharingGroup to be used for the operators - */ - public T slotSharingGroup(String newSlotSharingGroup) { - this.slotSharingGroup = newSlotSharingGroup; - return (T) this; - } - - /** - * Sets the parallelism for the stream. 
- * - * @param newParallelism the required parallelism - */ - public T parallelism(int newParallelism) { - OperatorValidationUtils.validateParallelism(newParallelism); - this.parallelism = newParallelism; - return (T) this; - } - - protected int index() { - return index; - } - - protected String taskName() { - return taskName; - } - - protected String tableName() { - return tableName; - } - - protected TableLoader tableLoader() { - return tableLoader; - } - - protected String uidSuffix() { - return uidSuffix; - } - - protected String slotSharingGroup() { - return slotSharingGroup; - } - - protected Integer parallelism() { - return parallelism; - } - - protected String operatorName(String operatorNameBase) { - return operatorNameBase + "[" + index() + "]"; - } - - TriggerEvaluator evaluator() { - return triggerEvaluator.build(); - } - - DataStream append( - DataStream sourceStream, - String newTableName, - String newTaskName, - int taskIndex, - TableLoader newTableLoader, - String defaultUidSuffix, - String defaultSlotSharingGroup, - int defaultParallelism) { - Preconditions.checkNotNull(newTaskName, "Task name should not be null"); - Preconditions.checkNotNull(newTableLoader, "TableLoader should not be null"); - - this.index = taskIndex; - this.taskName = newTaskName; - this.tableName = newTableName; - this.tableLoader = newTableLoader; - - if (uidSuffix == null) { - uidSuffix = this.taskName + "_" + index + "_" + defaultUidSuffix; - } - - if (parallelism == null) { - parallelism = defaultParallelism; - } - - if (slotSharingGroup == null) { - slotSharingGroup = defaultSlotSharingGroup; - } - - return append(sourceStream); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java deleted file mode 100644 index bedf70725a63..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.api; - -import java.time.Duration; -import java.util.Map; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.iceberg.actions.BinPackRewriteFilePlanner; -import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.maintenance.operator.DataFileRewriteCommitter; -import org.apache.iceberg.flink.maintenance.operator.DataFileRewritePlanner; -import org.apache.iceberg.flink.maintenance.operator.DataFileRewriteRunner; -import org.apache.iceberg.flink.maintenance.operator.TaskResultAggregator; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -/** - * Creates the data file rewriter data stream. Which runs a single iteration of the task for every - * {@link Trigger} event. - */ -public class RewriteDataFiles { - static final String PLANNER_TASK_NAME = "RDF Planner"; - static final String REWRITE_TASK_NAME = "Rewrite"; - static final String COMMIT_TASK_NAME = "Rewrite commit"; - static final String AGGREGATOR_TASK_NAME = "Rewrite aggregator"; - - private RewriteDataFiles() {} - - /** Creates the builder for a stream which rewrites data files for the table. */ - public static Builder builder() { - return new Builder(); - } - - public static class Builder extends MaintenanceTaskBuilder { - private boolean partialProgressEnabled = - org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_ENABLED_DEFAULT; - private int partialProgressMaxCommits = - org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT; - private final Map rewriteOptions = Maps.newHashMapWithExpectedSize(6); - private long maxRewriteBytes = Long.MAX_VALUE; - private Expression filter = Expressions.alwaysTrue(); - - @Override - String maintenanceTaskName() { - return "RewriteDataFiles"; - } - - /** - * Allows committing compacted data files in batches. See {@link - * org.apache.iceberg.actions.RewriteDataFiles#PARTIAL_PROGRESS_ENABLED} for more details. - * - * @param newPartialProgressEnabled to enable partial commits - */ - public Builder partialProgressEnabled(boolean newPartialProgressEnabled) { - this.partialProgressEnabled = newPartialProgressEnabled; - return this; - } - - /** - * Configures the size of batches if {@link #partialProgressEnabled}. See {@link - * org.apache.iceberg.actions.RewriteDataFiles#PARTIAL_PROGRESS_MAX_COMMITS} for more details. - * - * @param newPartialProgressMaxCommits to target number of the commits per run - */ - public Builder partialProgressMaxCommits(int newPartialProgressMaxCommits) { - this.partialProgressMaxCommits = newPartialProgressMaxCommits; - return this; - } - - /** - * Configures the maximum byte size of the rewrites for one scheduled compaction. This could be - * used to limit the resources used by the compaction. - * - * @param newMaxRewriteBytes to limit the size of the rewrites - */ - public Builder maxRewriteBytes(long newMaxRewriteBytes) { - this.maxRewriteBytes = newMaxRewriteBytes; - return this; - } - - /** - * Configures the target file size. See {@link - * org.apache.iceberg.actions.RewriteDataFiles#TARGET_FILE_SIZE_BYTES} for more details. 
- * - * @param targetFileSizeBytes target file size - */ - public Builder targetFileSizeBytes(long targetFileSizeBytes) { - this.rewriteOptions.put( - SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, String.valueOf(targetFileSizeBytes)); - return this; - } - - /** - * Configures the min file size considered for rewriting. See {@link - * SizeBasedFileRewritePlanner#MIN_FILE_SIZE_BYTES} for more details. - * - * @param minFileSizeBytes min file size - */ - public Builder minFileSizeBytes(long minFileSizeBytes) { - this.rewriteOptions.put( - SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, String.valueOf(minFileSizeBytes)); - return this; - } - - /** - * Configures the max file size considered for rewriting. See {@link - * SizeBasedFileRewritePlanner#MAX_FILE_SIZE_BYTES} for more details. - * - * @param maxFileSizeBytes max file size - */ - public Builder maxFileSizeBytes(long maxFileSizeBytes) { - this.rewriteOptions.put( - SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, String.valueOf(maxFileSizeBytes)); - return this; - } - - /** - * Configures the minimum file number after a rewrite is always initiated. See description see - * {@link SizeBasedFileRewritePlanner#MIN_INPUT_FILES} for more details. - * - * @param minInputFiles min file number - */ - public Builder minInputFiles(int minInputFiles) { - this.rewriteOptions.put( - SizeBasedFileRewritePlanner.MIN_INPUT_FILES, String.valueOf(minInputFiles)); - return this; - } - - /** - * Configures the minimum delete file number for a file after a rewrite is always initiated. See - * {@link BinPackRewriteFilePlanner#DELETE_FILE_THRESHOLD} for more details. - * - * @param deleteFileThreshold min delete file number - */ - public Builder deleteFileThreshold(int deleteFileThreshold) { - this.rewriteOptions.put( - BinPackRewriteFilePlanner.DELETE_FILE_THRESHOLD, String.valueOf(deleteFileThreshold)); - return this; - } - - /** - * Overrides other options and forces rewriting of all provided files. - * - * @param rewriteAll enables a full rewrite - */ - public Builder rewriteAll(boolean rewriteAll) { - this.rewriteOptions.put(SizeBasedFileRewritePlanner.REWRITE_ALL, String.valueOf(rewriteAll)); - return this; - } - - /** - * Configures the group size for rewriting. See {@link - * SizeBasedFileRewritePlanner#MAX_FILE_GROUP_SIZE_BYTES} for more details. - * - * @param maxFileGroupSizeBytes file group size for rewrite - */ - public Builder maxFileGroupSizeBytes(long maxFileGroupSizeBytes) { - this.rewriteOptions.put( - SizeBasedFileRewritePlanner.MAX_FILE_GROUP_SIZE_BYTES, - String.valueOf(maxFileGroupSizeBytes)); - return this; - } - - /** - * Configures max files to rewrite. See {@link BinPackRewriteFilePlanner#MAX_FILES_TO_REWRITE} - * for more details. - * - * @param maxFilesToRewrite maximum files to rewrite - */ - public Builder maxFilesToRewrite(int maxFilesToRewrite) { - this.rewriteOptions.put( - BinPackRewriteFilePlanner.MAX_FILES_TO_REWRITE, String.valueOf(maxFilesToRewrite)); - return this; - } - - /** - * A user provided filter for determining which files will be considered by the rewrite - * strategy. - * - * @param newFilter the filter expression to apply - * @return this for method chaining - */ - public Builder filter(Expression newFilter) { - this.filter = newFilter; - return this; - } - - /** - * Configures the properties for the rewriter. 
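To make the interplay of these options concrete, a hedged sketch of a builder configured for partial-progress compaction; the sizes, thresholds, and the event_day filter column are illustrative assumptions rather than recommended values.

import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.flink.maintenance.api.RewriteDataFiles;

public class RewriteOptionsSketch {
  public static RewriteDataFiles.Builder compaction() {
    return RewriteDataFiles.builder()
        .partialProgressEnabled(true)                // commit rewritten file groups in batches
        .partialProgressMaxCommits(10)               // spread the groups over at most 10 commits
        .maxRewriteBytes(50L * 1024 * 1024 * 1024)   // cap a single run at ~50 GB of rewritten input
        .targetFileSizeBytes(512L * 1024 * 1024)     // aim for 512 MB output files
        .minFileSizeBytes(32L * 1024 * 1024)         // files under 32 MB become rewrite candidates
        .minInputFiles(5)                            // only rewrite groups with at least 5 input files
        .deleteFileThreshold(10)                     // or files referenced by 10 or more delete files
        .filter(Expressions.greaterThanOrEqual("event_day", "2024-01-01")); // hypothetical column
  }
}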
- * - * @param rewriteDataFilesConfig properties for the rewriter - */ - public Builder config(RewriteDataFilesConfig rewriteDataFilesConfig) { - - // Config about the rewriter - this.partialProgressEnabled(rewriteDataFilesConfig.partialProgressEnable()) - .partialProgressMaxCommits(rewriteDataFilesConfig.partialProgressMaxCommits()) - .maxRewriteBytes(rewriteDataFilesConfig.maxRewriteBytes()) - // Config about the schedule - .scheduleOnCommitCount(rewriteDataFilesConfig.scheduleOnCommitCount()) - .scheduleOnDataFileCount(rewriteDataFilesConfig.scheduleOnDataFileCount()) - .scheduleOnDataFileSize(rewriteDataFilesConfig.scheduleOnDataFileSize()) - .scheduleOnInterval( - Duration.ofSeconds(rewriteDataFilesConfig.scheduleOnIntervalSecond())); - - // override the rewrite options - this.rewriteOptions.putAll(rewriteDataFilesConfig.properties()); - - return this; - } - - /** - * The input is a {@link DataStream} with {@link Trigger} events and every event should be - * immediately followed by a {@link Watermark} with the same timestamp as the event. - * - *

    The output is a {@link DataStream} with the {@link TaskResult} of the run followed by the - * {@link Watermark}. - */ - @Override - DataStream append(DataStream trigger) { - SingleOutputStreamOperator planned = - trigger - .process( - new DataFileRewritePlanner( - tableName(), - taskName(), - index(), - tableLoader(), - partialProgressEnabled ? partialProgressMaxCommits : 1, - maxRewriteBytes, - rewriteOptions, - filter)) - .name(operatorName(PLANNER_TASK_NAME)) - .uid(PLANNER_TASK_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .forceNonParallel(); - - SingleOutputStreamOperator rewritten = - planned - .rebalance() - .process(new DataFileRewriteRunner(tableName(), taskName(), index())) - .name(operatorName(REWRITE_TASK_NAME)) - .uid(REWRITE_TASK_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .setParallelism(parallelism()); - - SingleOutputStreamOperator updated = - rewritten - .transform( - operatorName(COMMIT_TASK_NAME), - TypeInformation.of(Trigger.class), - new DataFileRewriteCommitter(tableName(), taskName(), index(), tableLoader())) - .uid(COMMIT_TASK_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .forceNonParallel(); - - return trigger - .union(updated) - .connect( - planned - .getSideOutput(TaskResultAggregator.ERROR_STREAM) - .union( - rewritten.getSideOutput(TaskResultAggregator.ERROR_STREAM), - updated.getSideOutput(TaskResultAggregator.ERROR_STREAM))) - .transform( - operatorName(AGGREGATOR_TASK_NAME), - TypeInformation.of(TaskResult.class), - new TaskResultAggregator(tableName(), taskName(), index())) - .uid(AGGREGATOR_TASK_NAME + uidSuffix()) - .slotSharingGroup(slotSharingGroup()) - .forceNonParallel(); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java deleted file mode 100644 index b2fb83b75b86..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFilesConfig.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.api; - -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.iceberg.Table; -import org.apache.iceberg.actions.RewriteDataFiles; -import org.apache.iceberg.flink.FlinkConfParser; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -public class RewriteDataFilesConfig { - public static final String PREFIX = FlinkMaintenanceConfig.PREFIX + "rewrite."; - - public static final String MAX_BYTES = PREFIX + "max-bytes"; - public static final ConfigOption MAX_BYTES_OPTION = - ConfigOptions.key(MAX_BYTES) - .longType() - .defaultValue(Long.MAX_VALUE) - .withDescription( - "The maximum number of bytes allowed for a rewrite operation. " - + "If the total size of data files exceeds this limit, the rewrites within one scheduled compaction " - + "will be limited in size to restrict the resources used by the compaction."); - - public static final ConfigOption PARTIAL_PROGRESS_MAX_COMMITS_OPTION = - ConfigOptions.key(PREFIX + RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS) - .intType() - .defaultValue(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT) - .withDescription( - "The maximum number of commits allowed when partial progress is enabled. " - + "This configuration controls how many file groups " - + "are committed per run when partial progress is enabled."); - - public static final ConfigOption PARTIAL_PROGRESS_ENABLED_OPTION = - ConfigOptions.key(PREFIX + RewriteDataFiles.PARTIAL_PROGRESS_ENABLED) - .booleanType() - .defaultValue(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED_DEFAULT) - .withDescription( - "Whether to enable partial progress commits. " - + "When enabled, the rewrite operation will commit by file group, " - + "allowing progress even if some file groups fail to commit."); - - public static final String SCHEDULE_ON_COMMIT_COUNT = PREFIX + "schedule.commit-count"; - public static final ConfigOption SCHEDULE_ON_COMMIT_COUNT_OPTION = - ConfigOptions.key(SCHEDULE_ON_COMMIT_COUNT) - .intType() - .defaultValue(10) - .withDescription( - "The number of commits after which to trigger a new rewrite operation. " - + "This setting controls the frequency of rewrite operations."); - - public static final String SCHEDULE_ON_DATA_FILE_COUNT = PREFIX + "schedule.data-file-count"; - public static final ConfigOption SCHEDULE_ON_DATA_FILE_COUNT_OPTION = - ConfigOptions.key(SCHEDULE_ON_DATA_FILE_COUNT) - .intType() - .defaultValue(1000) - .withDescription("The number of data files that should trigger a new rewrite operation."); - - public static final String SCHEDULE_ON_DATA_FILE_SIZE = PREFIX + "schedule.data-file-size"; - public static final ConfigOption SCHEDULE_ON_DATA_FILE_SIZE_OPTION = - ConfigOptions.key(SCHEDULE_ON_DATA_FILE_SIZE) - .longType() - .defaultValue(100L * 1024 * 1024 * 1024) // Default is 100 GB - .withDescription( - "The total size of data files that should trigger a new rewrite operation."); - - public static final String SCHEDULE_ON_INTERVAL_SECOND = PREFIX + "schedule.interval-second"; - public static final ConfigOption SCHEDULE_ON_INTERVAL_SECOND_OPTION = - ConfigOptions.key(SCHEDULE_ON_INTERVAL_SECOND) - .longType() - .defaultValue(10 * 60L) // Default is 10 minutes - .withDescription( - "The time interval (in seconds) between two consecutive rewrite operations. 
" - + "This ensures periodic scheduling of rewrite tasks."); - - private final FlinkConfParser confParser; - private final Map writeProperties; - - public RewriteDataFilesConfig( - Table table, Map writeOptions, ReadableConfig readableConfig) { - this.writeProperties = writeOptions; - this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); - } - - /** Gets the number of commits that trigger a rewrite operation. */ - public int scheduleOnCommitCount() { - return confParser - .intConf() - .option(SCHEDULE_ON_COMMIT_COUNT) - .flinkConfig(SCHEDULE_ON_COMMIT_COUNT_OPTION) - .defaultValue(SCHEDULE_ON_COMMIT_COUNT_OPTION.defaultValue()) - .parse(); - } - - /** Gets the number of data files that trigger a rewrite operation. */ - public int scheduleOnDataFileCount() { - return confParser - .intConf() - .option(SCHEDULE_ON_DATA_FILE_COUNT) - .flinkConfig(SCHEDULE_ON_DATA_FILE_COUNT_OPTION) - .defaultValue(SCHEDULE_ON_DATA_FILE_COUNT_OPTION.defaultValue()) - .parse(); - } - - /** Gets the total size of data files that trigger a rewrite operation. */ - public long scheduleOnDataFileSize() { - return confParser - .longConf() - .option(SCHEDULE_ON_DATA_FILE_SIZE) - .flinkConfig(SCHEDULE_ON_DATA_FILE_SIZE_OPTION) - .defaultValue(SCHEDULE_ON_DATA_FILE_SIZE_OPTION.defaultValue()) - .parse(); - } - - /** Gets the time interval (in seconds) between two consecutive rewrite operations. */ - public long scheduleOnIntervalSecond() { - return confParser - .longConf() - .option(SCHEDULE_ON_INTERVAL_SECOND) - .flinkConfig(SCHEDULE_ON_INTERVAL_SECOND_OPTION) - .defaultValue(SCHEDULE_ON_INTERVAL_SECOND_OPTION.defaultValue()) - .parse(); - } - - /** Gets whether partial progress commits are enabled. */ - public boolean partialProgressEnable() { - return confParser - .booleanConf() - .option(PARTIAL_PROGRESS_ENABLED_OPTION.key()) - .flinkConfig(PARTIAL_PROGRESS_ENABLED_OPTION) - .defaultValue(PARTIAL_PROGRESS_ENABLED_OPTION.defaultValue()) - .parse(); - } - - /** Gets the maximum number of commits allowed for partial progress. */ - public int partialProgressMaxCommits() { - return confParser - .intConf() - .option(PARTIAL_PROGRESS_MAX_COMMITS_OPTION.key()) - .flinkConfig(PARTIAL_PROGRESS_MAX_COMMITS_OPTION) - .defaultValue(PARTIAL_PROGRESS_MAX_COMMITS_OPTION.defaultValue()) - .parse(); - } - - /** Gets the maximum rewrite bytes allowed for a single rewrite operation. */ - public long maxRewriteBytes() { - return confParser - .longConf() - .option(MAX_BYTES) - .flinkConfig(MAX_BYTES_OPTION) - .defaultValue(MAX_BYTES_OPTION.defaultValue()) - .parse(); - } - - public Map properties() { - return writeProperties.entrySet().stream() - .filter(entry -> entry.getKey().startsWith(PREFIX)) - .collect( - Collectors.toMap( - entry -> entry.getKey().substring(PREFIX.length()), - Map.Entry::getValue, - (existing, replacement) -> existing, - Maps::newHashMap)); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java deleted file mode 100644 index 1a2b0607dd1e..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TableMaintenance.java +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.api; - -import java.io.IOException; -import java.time.Duration; -import java.util.List; -import java.util.Locale; -import java.util.UUID; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.eventtime.TimestampAssigner; -import org.apache.flink.api.common.eventtime.TimestampAssignerSupplier; -import org.apache.flink.api.common.eventtime.Watermark; -import org.apache.flink.api.common.eventtime.WatermarkGenerator; -import org.apache.flink.api.common.eventtime.WatermarkGeneratorSupplier; -import org.apache.flink.api.common.eventtime.WatermarkOutput; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.operators.util.OperatorValidationUtils; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; -import org.apache.flink.api.dag.Transformation; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamUtils; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.graph.StreamGraphGenerator; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.operator.LockRemover; -import org.apache.iceberg.flink.maintenance.operator.MonitorSource; -import org.apache.iceberg.flink.maintenance.operator.TableChange; -import org.apache.iceberg.flink.maintenance.operator.TriggerEvaluator; -import org.apache.iceberg.flink.maintenance.operator.TriggerManager; -import org.apache.iceberg.flink.sink.IcebergSink; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -/** Creates the table maintenance graph. */ -public class TableMaintenance { - static final String SOURCE_OPERATOR_NAME_PREFIX = "Monitor source for "; - static final String TRIGGER_MANAGER_OPERATOR_NAME = "Trigger manager"; - static final String WATERMARK_ASSIGNER_OPERATOR_NAME = "Watermark Assigner"; - static final String FILTER_OPERATOR_NAME_PREFIX = "Filter "; - static final String LOCK_REMOVER_OPERATOR_NAME = "Lock remover"; - - static final long RATE_LIMIT_SECOND_DEFAULT = 60; - static final long LOCK_CHECK_DELAY_SECOND_DEFAULT = 30; - static final int MAX_READ_BACK_DEFAULT = 100; - - private TableMaintenance() {} - - /** - * Use when the change stream is already provided, like in the {@link - * IcebergSink#addPostCommitTopology(DataStream)}. 
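A hedged sketch of the configuration path removed just above: building a RewriteDataFilesConfig from a table, string write options, and a Flink configuration, then applying it through RewriteDataFiles.Builder#config. The String-to-String map type and the option value are assumptions; the key constant comes from the deleted RewriteDataFilesConfig class.

import java.util.Map;
import org.apache.flink.configuration.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.maintenance.api.RewriteDataFiles;
import org.apache.iceberg.flink.maintenance.api.RewriteDataFilesConfig;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;

public class RewriteConfigSketch {
  public static RewriteDataFiles.Builder fromConfig(Table table) {
    Map<String, String> writeOptions = Maps.newHashMap();
    // Explicit option; each getter resolves its key via FlinkConfParser together with
    // the table properties, the Flink ReadableConfig, and the declared defaults.
    writeOptions.put(RewriteDataFilesConfig.SCHEDULE_ON_COMMIT_COUNT, "5");

    RewriteDataFilesConfig config =
        new RewriteDataFilesConfig(table, writeOptions, new Configuration());
    return RewriteDataFiles.builder().config(config);
  }
}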
- * - * @param changeStream the table changes - * @param tableLoader used for accessing the table - * @param lockFactory used for preventing concurrent task runs - * @return builder for the maintenance stream - */ - @Internal - public static Builder forChangeStream( - DataStream changeStream, - TableLoader tableLoader, - TriggerLockFactory lockFactory) { - Preconditions.checkNotNull(changeStream, "The change stream should not be null"); - Preconditions.checkNotNull(tableLoader, "TableLoader should not be null"); - Preconditions.checkNotNull(lockFactory, "LockFactory should not be null"); - - return new Builder(null, changeStream, tableLoader, lockFactory); - } - - /** - * Use this for standalone maintenance job. It creates a monitor source that detect table changes - * and build the maintenance pipelines afterwards. - * - * @param env used to register the monitor source - * @param tableLoader used for accessing the table - * @param lockFactory used for preventing concurrent task runs - * @return builder for the maintenance stream - */ - public static Builder forTable( - StreamExecutionEnvironment env, TableLoader tableLoader, TriggerLockFactory lockFactory) { - Preconditions.checkNotNull(env, "StreamExecutionEnvironment should not be null"); - Preconditions.checkNotNull(tableLoader, "TableLoader should not be null"); - Preconditions.checkNotNull(lockFactory, "LockFactory should not be null"); - - return new Builder(env, null, tableLoader, lockFactory); - } - - public static class Builder { - private final StreamExecutionEnvironment env; - private final DataStream inputStream; - private final TableLoader tableLoader; - private final List> taskBuilders; - private final TriggerLockFactory lockFactory; - - private String uidSuffix = "TableMaintenance-" + UUID.randomUUID(); - private String slotSharingGroup = StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP; - private Duration rateLimit = Duration.ofSeconds(RATE_LIMIT_SECOND_DEFAULT); - private Duration lockCheckDelay = Duration.ofSeconds(LOCK_CHECK_DELAY_SECOND_DEFAULT); - private int parallelism = ExecutionConfig.PARALLELISM_DEFAULT; - private int maxReadBack = MAX_READ_BACK_DEFAULT; - - private Builder( - StreamExecutionEnvironment env, - DataStream inputStream, - TableLoader tableLoader, - TriggerLockFactory lockFactory) { - this.env = env; - this.inputStream = inputStream; - this.tableLoader = tableLoader; - this.lockFactory = lockFactory; - this.taskBuilders = Lists.newArrayListWithCapacity(4); - } - - /** - * The suffix used for the generated {@link Transformation}'s uid. - * - * @param newUidSuffix for the transformations - */ - public Builder uidSuffix(String newUidSuffix) { - this.uidSuffix = newUidSuffix; - return this; - } - - /** - * The {@link SingleOutputStreamOperator#slotSharingGroup(String)} for all the operators of the - * generated stream. Could be used to separate the resources used by this task. - * - * @param newSlotSharingGroup to be used for the operators - */ - public Builder slotSharingGroup(String newSlotSharingGroup) { - this.slotSharingGroup = newSlotSharingGroup; - return this; - } - - /** - * Limits the firing frequency for the task triggers. 
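A hedged end-to-end sketch of the standalone wiring described here: the table loader and lock factory are assumed to be supplied by the caller, and the uid suffix, rate limit, and job name are illustrative choices rather than prescribed values.

import java.time.Duration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.maintenance.api.RewriteDataFiles;
import org.apache.iceberg.flink.maintenance.api.TableMaintenance;
import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory;

public class MaintenanceJobSketch {
  public static void run(TableLoader tableLoader, TriggerLockFactory lockFactory) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    TableMaintenance.forTable(env, tableLoader, lockFactory)
        .uidSuffix("table-maintenance")         // stable uids across job restarts
        .rateLimit(Duration.ofMinutes(1))       // fire task triggers at most once per minute
        .lockCheckDelay(Duration.ofSeconds(30)) // back off while another run holds the lock
        .add(RewriteDataFiles.builder().scheduleOnCommitCount(10))
        .append();                              // builds the maintenance graph on env

    env.execute("Iceberg table maintenance");
  }
}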
- * - * @param newRateLimit firing frequency - */ - public Builder rateLimit(Duration newRateLimit) { - Preconditions.checkNotNull(newRateLimit, "Rate limit should not be null"); - Preconditions.checkArgument( - newRateLimit.toMillis() > 0, "Rate limit should be greater than 0"); - this.rateLimit = newRateLimit; - return this; - } - - /** - * Sets the delay for checking lock availability when a concurrent run is detected. - * - * @param newLockCheckDelay lock checking frequency - */ - public Builder lockCheckDelay(Duration newLockCheckDelay) { - this.lockCheckDelay = newLockCheckDelay; - return this; - } - - /** - * Sets the default parallelism of maintenance tasks. Could be overwritten by the {@link - * MaintenanceTaskBuilder#parallelism(int)}. - * - * @param newParallelism task parallelism - */ - public Builder parallelism(int newParallelism) { - OperatorValidationUtils.validateParallelism(newParallelism); - this.parallelism = newParallelism; - return this; - } - - /** - * Maximum number of snapshots checked when started with an embedded {@link MonitorSource} at - * the first time. Only available when the {@link - * TableMaintenance#forTable(StreamExecutionEnvironment, TableLoader, TriggerLockFactory)} is - * used. - * - * @param newMaxReadBack snapshots to consider when initializing - */ - public Builder maxReadBack(int newMaxReadBack) { - Preconditions.checkArgument( - inputStream == null, "Can't set maxReadBack when change stream is provided"); - this.maxReadBack = newMaxReadBack; - return this; - } - - /** - * Adds a specific task with the given schedule. - * - * @param task to add - */ - public Builder add(MaintenanceTaskBuilder task) { - taskBuilders.add(task); - return this; - } - - /** Builds the task graph for the maintenance tasks. */ - public void append() throws IOException { - Preconditions.checkArgument(!taskBuilders.isEmpty(), "Provide at least one task"); - Preconditions.checkNotNull(uidSuffix, "Uid suffix should no be null"); - - List taskNames = Lists.newArrayListWithCapacity(taskBuilders.size()); - List evaluators = Lists.newArrayListWithCapacity(taskBuilders.size()); - for (int i = 0; i < taskBuilders.size(); ++i) { - taskNames.add(nameFor(taskBuilders.get(i), i)); - evaluators.add(taskBuilders.get(i).evaluator()); - } - - try (TableLoader loader = tableLoader.clone()) { - loader.open(); - String tableName = loader.loadTable().name(); - DataStream triggers = - DataStreamUtils.reinterpretAsKeyedStream( - changeStream(tableName, loader), unused -> true) - .process( - new TriggerManager( - loader, - lockFactory, - taskNames, - evaluators, - rateLimit.toMillis(), - lockCheckDelay.toMillis())) - .name(TRIGGER_MANAGER_OPERATOR_NAME) - .uid(TRIGGER_MANAGER_OPERATOR_NAME + uidSuffix) - .slotSharingGroup(slotSharingGroup) - .forceNonParallel() - .assignTimestampsAndWatermarks(new PunctuatedWatermarkStrategy()) - .name(WATERMARK_ASSIGNER_OPERATOR_NAME) - .uid(WATERMARK_ASSIGNER_OPERATOR_NAME + uidSuffix) - .slotSharingGroup(slotSharingGroup) - .forceNonParallel(); - - // Add the specific tasks - DataStream unioned = null; - for (int i = 0; i < taskBuilders.size(); ++i) { - int taskIndex = i; - DataStream filtered = - triggers - .filter(t -> t.taskId() != null && t.taskId() == taskIndex) - .name(FILTER_OPERATOR_NAME_PREFIX + taskIndex) - .forceNonParallel() - .uid(FILTER_OPERATOR_NAME_PREFIX + taskIndex + "-" + uidSuffix) - .slotSharingGroup(slotSharingGroup); - MaintenanceTaskBuilder builder = taskBuilders.get(taskIndex); - DataStream result = - builder.append( - filtered, - 
tableName, - taskNames.get(taskIndex), - taskIndex, - loader, - uidSuffix, - slotSharingGroup, - parallelism); - if (unioned == null) { - unioned = result; - } else { - unioned = unioned.union(result); - } - } - - // Add the LockRemover to the end - unioned - .transform( - LOCK_REMOVER_OPERATOR_NAME, - TypeInformation.of(Void.class), - new LockRemover(tableName, lockFactory, taskNames)) - .forceNonParallel() - .uid("lock-remover-" + uidSuffix) - .slotSharingGroup(slotSharingGroup); - } - } - - private DataStream changeStream(String tableName, TableLoader loader) { - if (inputStream == null) { - // Create a monitor source to provide the TableChange stream - MonitorSource source = - new MonitorSource( - loader, RateLimiterStrategy.perSecond(1.0 / rateLimit.getSeconds()), maxReadBack); - return env.fromSource( - source, WatermarkStrategy.noWatermarks(), SOURCE_OPERATOR_NAME_PREFIX + tableName) - .uid(SOURCE_OPERATOR_NAME_PREFIX + uidSuffix) - .slotSharingGroup(slotSharingGroup) - .forceNonParallel(); - } else { - return inputStream.global(); - } - } - - private static String nameFor(MaintenanceTaskBuilder streamBuilder, int taskIndex) { - return String.format(Locale.ROOT, "%s [%d]", streamBuilder.maintenanceTaskName(), taskIndex); - } - } - - @Internal - public static class PunctuatedWatermarkStrategy implements WatermarkStrategy { - @Override - public WatermarkGenerator createWatermarkGenerator( - WatermarkGeneratorSupplier.Context context) { - return new WatermarkGenerator<>() { - @Override - public void onEvent(Trigger event, long eventTimestamp, WatermarkOutput output) { - output.emitWatermark(new Watermark(event.timestamp())); - } - - @Override - public void onPeriodicEmit(WatermarkOutput output) { - // No periodic watermarks - } - }; - } - - @Override - public TimestampAssigner createTimestampAssigner( - TimestampAssignerSupplier.Context context) { - return (element, unused) -> element.timestamp(); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java deleted file mode 100644 index ca1462526f13..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TaskResult.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.api; - -import java.io.Serializable; -import java.util.List; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -/** The result of a single Maintenance Task. 
*/ -public class TaskResult implements Serializable { - private final int taskIndex; - private final long startEpoch; - private final boolean success; - private final List exceptions; - - public TaskResult(int taskIndex, long startEpoch, boolean success, List exceptions) { - this.taskIndex = taskIndex; - this.startEpoch = startEpoch; - this.success = success; - this.exceptions = exceptions; - } - - public int taskIndex() { - return taskIndex; - } - - public long startEpoch() { - return startEpoch; - } - - public boolean success() { - return success; - } - - public List exceptions() { - return exceptions; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("taskIndex", taskIndex) - .add("startEpoch", startEpoch) - .add("success", success) - .add("exceptions", exceptions) - .toString(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java deleted file mode 100644 index 09209ba15153..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/Trigger.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
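For completeness, a tiny sketch of the TaskResult value object removed above; the task index and start time are arbitrary, and the exception list element type is assumed to be Exception.

import java.util.Collections;
import org.apache.iceberg.flink.maintenance.api.TaskResult;

public class TaskResultSketch {
  public static void main(String[] args) {
    // A successful run of task #0 that started now and raised no exceptions.
    TaskResult ok = new TaskResult(0, System.currentTimeMillis(), true, Collections.emptyList());
    System.out.println(ok.success()); // true
  }
}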
- */ -package org.apache.iceberg.flink.maintenance.api; - -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -public class Trigger { - private final long timestamp; - private final Integer taskId; - private final boolean isRecovery; - - private Trigger(long timestamp, Integer taskId, boolean isRecovery) { - this.timestamp = timestamp; - this.taskId = taskId; - this.isRecovery = isRecovery; - } - - @Internal - public static Trigger create(long timestamp, int taskId) { - return new Trigger(timestamp, taskId, false); - } - - @Internal - public static Trigger recovery(long timestamp) { - return new Trigger(timestamp, null, true); - } - - public long timestamp() { - return timestamp; - } - - public Integer taskId() { - return taskId; - } - - public boolean isRecovery() { - return isRecovery; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("timestamp", timestamp) - .add("taskId", taskId) - .add("isRecovery", isRecovery) - .toString(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java deleted file mode 100644 index c31381355efe..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/TriggerLockFactory.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.api; - -import java.io.Closeable; -import java.io.Serializable; -import org.apache.flink.annotation.Experimental; -import org.apache.iceberg.flink.maintenance.operator.LockRemover; -import org.apache.iceberg.flink.maintenance.operator.TriggerManager; - -/** Lock interface for handling locks for the Flink Table Maintenance jobs. */ -@Experimental -public interface TriggerLockFactory extends Serializable, Closeable { - void open(); - - Lock createLock(); - - Lock createRecoveryLock(); - - interface Lock { - /** - * Tries to acquire a lock with a given key. Anyone already holding a lock would prevent - * acquiring this lock. Not reentrant. - * - *

    Called by {@link TriggerManager}. Implementations could assume that are no concurrent - * calls for this method. - * - * @return true if the lock is acquired by this job, false if the lock - * is already held by someone - */ - boolean tryLock(); - - /** - * Checks if the lock is already taken. - * - * @return true if the lock is held by someone - */ - boolean isHeld(); - - /** - * Releases the lock. Should not fail if the lock is not held by anyone. - * - *
    Called by {@link TriggerManager}. Implementations could assume that there are no concurrent - * calls for this method. - * - * @return true if the lock is acquired by this job, false if the lock - * is already held by someone - */ - boolean tryLock(); - - /** - * Checks if the lock is already taken. - * - * @return true if the lock is held by someone - */ - boolean isHeld(); - - /** - * Releases the lock. Should not fail if the lock is not held by anyone. - * - *

    Called by {@link LockRemover}. Implementations could assume that are no concurrent calls - * for this method. - */ - void unlock(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java deleted file mode 100644 index 539ba6b297c8..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/ZkLockFactory.java +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.api; - -import java.io.IOException; -import java.util.concurrent.TimeUnit; -import org.apache.flink.shaded.curator5.org.apache.curator.framework.CuratorFramework; -import org.apache.flink.shaded.curator5.org.apache.curator.framework.CuratorFrameworkFactory; -import org.apache.flink.shaded.curator5.org.apache.curator.framework.recipes.shared.SharedCount; -import org.apache.flink.shaded.curator5.org.apache.curator.framework.recipes.shared.VersionedValue; -import org.apache.flink.shaded.curator5.org.apache.curator.retry.ExponentialBackoffRetry; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Zookeeper backed implementation of the {@link TriggerLockFactory}. 
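Before the ZooKeeper-backed factory continues below, a minimal single-JVM sketch of the lock contract defined by the interface removed above. This is an illustration of the tryLock/isHeld/unlock semantics only; it cannot coordinate across TaskManagers, so real deployments would use a distributed implementation such as the ZkLockFactory deleted in this patch.

import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory;

public class InMemoryLockFactory implements TriggerLockFactory {
  private final AtomicBoolean taskLock = new AtomicBoolean(false);
  private final AtomicBoolean recoveryLock = new AtomicBoolean(false);

  @Override
  public void open() {
    // Nothing to initialize for an in-process lock.
  }

  @Override
  public Lock createLock() {
    return new InMemoryLock(taskLock);
  }

  @Override
  public Lock createRecoveryLock() {
    return new InMemoryLock(recoveryLock);
  }

  @Override
  public void close() throws IOException {
    // Nothing to release.
  }

  private static class InMemoryLock implements Lock {
    private final AtomicBoolean held;

    private InMemoryLock(AtomicBoolean held) {
      this.held = held;
    }

    @Override
    public boolean tryLock() {
      // Not reentrant: a second tryLock while the lock is held returns false.
      return held.compareAndSet(false, true);
    }

    @Override
    public boolean isHeld() {
      return held.get();
    }

    @Override
    public void unlock() {
      // Must not fail when nobody holds the lock.
      held.set(false);
    }
  }
}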
*/ -public class ZkLockFactory implements TriggerLockFactory { - private static final Logger LOG = LoggerFactory.getLogger(ZkLockFactory.class); - - private static final String LOCK_BASE_PATH = "/iceberg/flink/maintenance/locks/"; - - private final String connectString; - private final String lockId; - private final int sessionTimeoutMs; - private final int connectionTimeoutMs; - private final int baseSleepTimeMs; - private final int maxRetries; - private transient CuratorFramework client; - private transient SharedCount taskSharedCount; - private transient SharedCount recoverySharedCount; - private volatile boolean isOpen; - - /** - * Create Zookeeper lock factory - * - * @param connectString Zookeeper connection string - * @param lockId which should identify the job and the table - * @param sessionTimeoutMs Session timeout in milliseconds - * @param connectionTimeoutMs Connection timeout in milliseconds - * @param baseSleepTimeMs Base sleep time in milliseconds - * @param maxRetries Maximum number of retries - */ - public ZkLockFactory( - String connectString, - String lockId, - int sessionTimeoutMs, - int connectionTimeoutMs, - int baseSleepTimeMs, - int maxRetries) { - Preconditions.checkNotNull(connectString, "Zookeeper connection string cannot be null"); - Preconditions.checkNotNull(lockId, "Lock ID cannot be null"); - Preconditions.checkArgument( - sessionTimeoutMs >= 0, "Session timeout must be positive, got: %s", sessionTimeoutMs); - Preconditions.checkArgument( - connectionTimeoutMs >= 0, - "Connection timeout must be positive, got: %s", - connectionTimeoutMs); - Preconditions.checkArgument( - baseSleepTimeMs >= 0, "Base sleep time must be positive, got: %s", baseSleepTimeMs); - Preconditions.checkArgument( - maxRetries >= 0, "Max retries must be non-negative, got: %s", maxRetries); - this.connectString = connectString; - this.lockId = lockId; - this.sessionTimeoutMs = sessionTimeoutMs; - this.connectionTimeoutMs = connectionTimeoutMs; - this.baseSleepTimeMs = baseSleepTimeMs; - this.maxRetries = maxRetries; - } - - @Override - public void open() { - if (isOpen) { - LOG.debug("ZkLockFactory already opened for lockId: {}.", lockId); - return; - } - - this.client = - CuratorFrameworkFactory.builder() - .connectString(connectString) - .sessionTimeoutMs(sessionTimeoutMs) - .connectionTimeoutMs(connectionTimeoutMs) - .retryPolicy(new ExponentialBackoffRetry(baseSleepTimeMs, maxRetries)) - .build(); - client.start(); - - try { - if (!client.blockUntilConnected(connectionTimeoutMs, TimeUnit.MILLISECONDS)) { - throw new IllegalStateException("Connection to Zookeeper timed out"); - } - - this.taskSharedCount = new SharedCount(client, getTaskSharePath(), 0); - this.recoverySharedCount = new SharedCount(client, getRecoverySharedPath(), 0); - taskSharedCount.start(); - recoverySharedCount.start(); - isOpen = true; - LOG.info("ZkLockFactory initialized for lockId: {}.", lockId); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new RuntimeException("Interrupted while connecting to Zookeeper", e); - } catch (Exception e) { - closeQuietly(); - throw new RuntimeException("Failed to initialize SharedCount", e); - } - } - - private String getTaskSharePath() { - return LOCK_BASE_PATH + lockId + "/task"; - } - - private String getRecoverySharedPath() { - return LOCK_BASE_PATH + lockId + "/recovery"; - } - - private void closeQuietly() { - try { - close(); - } catch (Exception e) { - LOG.warn("Failed to close ZkLockFactory for lockId: {}", lockId, e); - } - } - - 
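A hedged sketch of constructing the factory with the six constructor arguments documented above; the connection string, lock id, and timeout values are placeholders, not recommended settings.

import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory;
import org.apache.iceberg.flink.maintenance.api.ZkLockFactory;

public class ZkLockFactorySketch {
  public static TriggerLockFactory create() {
    return new ZkLockFactory(
        "zk-1:2181,zk-2:2181,zk-3:2181", // ZooKeeper connect string
        "catalog.db.table",              // lock id identifying the job and the table
        60_000,                          // session timeout (ms)
        15_000,                          // connection timeout (ms)
        3_000,                           // base sleep time for the exponential backoff retry (ms)
        3);                              // max retries
  }
}

The resulting factory would typically be handed to TableMaintenance.forTable(...) as shown earlier in this patch.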
@Override - public Lock createLock() { - return new ZkLock(getTaskSharePath(), taskSharedCount); - } - - @Override - public Lock createRecoveryLock() { - return new ZkLock(getRecoverySharedPath(), recoverySharedCount); - } - - @Override - public void close() throws IOException { - try { - if (taskSharedCount != null) { - taskSharedCount.close(); - } - - if (recoverySharedCount != null) { - recoverySharedCount.close(); - } - } finally { - if (client != null) { - client.close(); - } - - isOpen = false; - } - } - - /** Zookeeper lock implementation */ - private static class ZkLock implements Lock { - private final SharedCount sharedCount; - private final String lockPath; - - private static final int LOCKED = 1; - private static final int UNLOCKED = 0; - - private ZkLock(String lockPath, SharedCount sharedCount) { - this.lockPath = lockPath; - this.sharedCount = sharedCount; - } - - @Override - public boolean tryLock() { - VersionedValue versionedValue = sharedCount.getVersionedValue(); - if (isHeld(versionedValue)) { - LOG.debug("Lock is already held for path: {}", lockPath); - return false; - } - - try { - boolean acquired = sharedCount.trySetCount(versionedValue, LOCKED); - if (!acquired) { - LOG.debug("Failed to acquire lock for path: {}", lockPath); - } - - return acquired; - } catch (Exception e) { - LOG.warn("Failed to acquire Zookeeper lock", e); - return false; - } - } - - @Override - public boolean isHeld() { - return isHeld(sharedCount.getVersionedValue()); - } - - private static boolean isHeld(VersionedValue versionedValue) { - try { - return versionedValue.getValue() == LOCKED; - } catch (Exception e) { - throw new RuntimeException("Failed to check Zookeeper lock status", e); - } - } - - @Override - public void unlock() { - try { - sharedCount.setCount(UNLOCKED); - LOG.debug("Released lock for path: {}", lockPath); - } catch (Exception e) { - LOG.warn("Failed to release lock for path: {}", lockPath, e); - throw new RuntimeException("Failed to release lock", e); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java deleted file mode 100644 index 135d3d9b42db..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteCommitter.java +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.io.IOException; -import java.util.Set; -import org.apache.flink.annotation.Internal; -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Table; -import org.apache.iceberg.actions.RewriteDataFilesCommitManager; -import org.apache.iceberg.actions.RewriteDataFilesCommitManager.CommitService; -import org.apache.iceberg.actions.RewriteFileGroup; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Commits the rewrite changes using {@link RewriteDataFilesCommitManager}. The input is a {@link - * DataFileRewriteRunner.ExecutedGroup}. Only {@link Watermark} is emitted which is chained to - * {@link TaskResultAggregator} input 1. - */ -@Internal -public class DataFileRewriteCommitter extends AbstractStreamOperator - implements OneInputStreamOperator { - private static final Logger LOG = LoggerFactory.getLogger(DataFileRewriteCommitter.class); - - private final String tableName; - private final String taskName; - private final int taskIndex; - private final TableLoader tableLoader; - - private transient Table table; - private transient CommitService commitService; - private transient Counter errorCounter; - private transient Counter addedDataFileNumCounter; - private transient Counter addedDataFileSizeCounter; - private transient Counter removedDataFileNumCounter; - private transient Counter removedDataFileSizeCounter; - - public DataFileRewriteCommitter( - String tableName, String taskName, int taskIndex, TableLoader tableLoader) { - Preconditions.checkNotNull(tableName, "Table name should no be null"); - Preconditions.checkNotNull(taskName, "Task name should no be null"); - Preconditions.checkNotNull(tableLoader, "Table loader should no be null"); - - this.tableName = tableName; - this.taskName = taskName; - this.taskIndex = taskIndex; - this.tableLoader = tableLoader; - } - - @Override - public void open() throws Exception { - super.open(); - - tableLoader.open(); - this.table = tableLoader.loadTable(); - - MetricGroup taskMetricGroup = - TableMaintenanceMetrics.groupFor(getRuntimeContext(), tableName, taskName, taskIndex); - this.errorCounter = taskMetricGroup.counter(TableMaintenanceMetrics.ERROR_COUNTER); - this.addedDataFileNumCounter = - taskMetricGroup.counter(TableMaintenanceMetrics.ADDED_DATA_FILE_NUM_METRIC); - this.addedDataFileSizeCounter = - taskMetricGroup.counter(TableMaintenanceMetrics.ADDED_DATA_FILE_SIZE_METRIC); - this.removedDataFileNumCounter = - taskMetricGroup.counter(TableMaintenanceMetrics.REMOVED_DATA_FILE_NUM_METRIC); - this.removedDataFileSizeCounter = - taskMetricGroup.counter(TableMaintenanceMetrics.REMOVED_DATA_FILE_SIZE_METRIC); - } - - @Override - public void processElement(StreamRecord streamRecord) { - DataFileRewriteRunner.ExecutedGroup executedGroup = streamRecord.getValue(); - try { - if (commitService == null) { - // Refresh the table to get the latest snapshot for the committer - table.refresh(); - - 
FlinkRewriteDataFilesCommitManager commitManager = - new FlinkRewriteDataFilesCommitManager( - table, executedGroup.snapshotId(), streamRecord.getTimestamp()); - this.commitService = commitManager.service(executedGroup.groupsPerCommit()); - commitService.start(); - } - - commitService.offer(executedGroup.group()); - } catch (Exception e) { - LOG.warn( - DataFileRewritePlanner.MESSAGE_PREFIX + "Exception processing {}", - tableName, - taskName, - taskIndex, - streamRecord.getTimestamp(), - executedGroup, - e); - output.collect(TaskResultAggregator.ERROR_STREAM, new StreamRecord<>(e)); - errorCounter.inc(); - } - } - - @Override - public void processWatermark(Watermark mark) throws Exception { - try { - if (commitService != null) { - commitService.close(); - } - - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX + "Successfully completed data file compaction", - tableName, - taskName, - taskIndex, - mark.getTimestamp()); - } catch (Exception e) { - LOG.warn( - DataFileRewritePlanner.MESSAGE_PREFIX + "Exception closing commit service", - tableName, - taskName, - taskIndex, - mark.getTimestamp(), - e); - output.collect(TaskResultAggregator.ERROR_STREAM, new StreamRecord<>(e)); - errorCounter.inc(); - } - - // Cleanup - this.commitService = null; - - super.processWatermark(mark); - } - - @Override - public void close() throws IOException { - if (commitService != null) { - commitService.close(); - } - } - - private class FlinkRewriteDataFilesCommitManager extends RewriteDataFilesCommitManager { - private final long timestamp; - - FlinkRewriteDataFilesCommitManager(Table table, long startingSnapshotId, long timestamp) { - super(table, startingSnapshotId); - this.timestamp = timestamp; - } - - @Override - public void commitFileGroups(Set fileGroups) { - super.commitFileGroups(fileGroups); - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX + "Committed {}", - tableName, - taskName, - taskIndex, - timestamp, - fileGroups); - updateMetrics(fileGroups); - } - - private void updateMetrics(Set fileGroups) { - for (RewriteFileGroup fileGroup : fileGroups) { - for (DataFile added : fileGroup.addedFiles()) { - addedDataFileNumCounter.inc(); - addedDataFileSizeCounter.inc(added.fileSizeInBytes()); - } - - for (DataFile rewritten : fileGroup.rewrittenFiles()) { - removedDataFileNumCounter.inc(); - removedDataFileSizeCounter.inc(rewritten.fileSizeInBytes()); - } - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java deleted file mode 100644 index 5403dfe19aae..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.math.RoundingMode; -import java.util.List; -import java.util.Map; -import org.apache.flink.annotation.Internal; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.metrics.Counter; -import org.apache.flink.streaming.api.functions.ProcessFunction; -import org.apache.flink.util.Collector; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableUtil; -import org.apache.iceberg.actions.BinPackRewriteFilePlanner; -import org.apache.iceberg.actions.FileRewritePlan; -import org.apache.iceberg.actions.RewriteDataFiles; -import org.apache.iceberg.actions.RewriteFileGroup; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.math.IntMath; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Plans the rewrite groups using the {@link BinPackRewriteFilePlanner}. The input is the {@link - * Trigger}, the output is zero, or some {@link PlannedGroup}s. - */ -@Internal -public class DataFileRewritePlanner - extends ProcessFunction { - static final String MESSAGE_PREFIX = "[For table {} with {}[{}] at {}]: "; - private static final Logger LOG = LoggerFactory.getLogger(DataFileRewritePlanner.class); - - private final String tableName; - private final String taskName; - private final int taskIndex; - private final TableLoader tableLoader; - private final int partialProgressMaxCommits; - private final long maxRewriteBytes; - private final Map rewriterOptions; - private transient Counter errorCounter; - private final Expression filter; - - public DataFileRewritePlanner( - String tableName, - String taskName, - int taskIndex, - TableLoader tableLoader, - int newPartialProgressMaxCommits, - long maxRewriteBytes, - Map rewriterOptions, - Expression filter) { - - Preconditions.checkNotNull(tableName, "Table name should no be null"); - Preconditions.checkNotNull(taskName, "Task name should no be null"); - Preconditions.checkNotNull(tableLoader, "Table loader should no be null"); - Preconditions.checkNotNull(rewriterOptions, "Options map should no be null"); - - this.tableName = tableName; - this.taskName = taskName; - this.taskIndex = taskIndex; - this.tableLoader = tableLoader; - this.partialProgressMaxCommits = newPartialProgressMaxCommits; - this.maxRewriteBytes = maxRewriteBytes; - this.rewriterOptions = rewriterOptions; - this.filter = filter; - } - - @Override - public void open(Configuration parameters) throws Exception { - tableLoader.open(); - Table table = tableLoader.loadTable(); - Preconditions.checkArgument( - !TableUtil.supportsRowLineage(table), - "Flink does not support compaction on row lineage enabled tables (V3+)"); - this.errorCounter = - TableMaintenanceMetrics.groupFor(getRuntimeContext(), tableName, taskName, taskIndex) - .counter(TableMaintenanceMetrics.ERROR_COUNTER); - } - - @Override - public void processElement(Trigger value, Context ctx, Collector out) - throws Exception { - LOG.info( - 
DataFileRewritePlanner.MESSAGE_PREFIX + "Creating rewrite plan", - tableName, - taskName, - taskIndex, - ctx.timestamp()); - try { - SerializableTable table = - (SerializableTable) SerializableTable.copyOf(tableLoader.loadTable()); - if (table.currentSnapshot() == null) { - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX + "Nothing to plan for in an empty table", - tableName, - taskName, - taskIndex, - ctx.timestamp()); - return; - } - - BinPackRewriteFilePlanner planner = new BinPackRewriteFilePlanner(table, filter); - planner.init(rewriterOptions); - - FileRewritePlan - plan = planner.plan(); - - long rewriteBytes = 0; - List groups = Lists.newArrayList(); - for (CloseableIterator groupIterator = plan.groups().iterator(); - groupIterator.hasNext(); ) { - RewriteFileGroup group = groupIterator.next(); - if (rewriteBytes + group.inputFilesSizeInBytes() > maxRewriteBytes) { - // Keep going, maybe some other group might fit in - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX - + "Skipping group as max rewrite size reached {}", - tableName, - taskName, - taskIndex, - ctx.timestamp(), - group); - } else { - rewriteBytes += group.inputFilesSizeInBytes(); - groups.add(group); - } - } - - int groupsPerCommit = - IntMath.divide(groups.size(), partialProgressMaxCommits, RoundingMode.CEILING); - - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX + "Rewrite plan created {}", - tableName, - taskName, - taskIndex, - ctx.timestamp(), - groups); - - for (RewriteFileGroup group : groups) { - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX + "Emitting {}", - tableName, - taskName, - taskIndex, - ctx.timestamp(), - group); - out.collect(new PlannedGroup(table, groupsPerCommit, group)); - } - } catch (Exception e) { - LOG.warn( - DataFileRewritePlanner.MESSAGE_PREFIX + "Failed to plan data file rewrite groups", - tableName, - taskName, - taskIndex, - ctx.timestamp(), - e); - ctx.output(TaskResultAggregator.ERROR_STREAM, e); - errorCounter.inc(); - } - } - - @Override - public void close() throws Exception { - super.close(); - tableLoader.close(); - } - - public static class PlannedGroup { - private final SerializableTable table; - private final int groupsPerCommit; - private final RewriteFileGroup group; - - private PlannedGroup(SerializableTable table, int groupsPerCommit, RewriteFileGroup group) { - this.table = table; - this.groupsPerCommit = groupsPerCommit; - this.group = group; - } - - SerializableTable table() { - return table; - } - - int groupsPerCommit() { - return groupsPerCommit; - } - - RewriteFileGroup group() { - return group; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java deleted file mode 100644 index c03b5cc1c8fd..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewriteRunner.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - -import java.util.Collections; -import java.util.Set; -import org.apache.flink.annotation.Internal; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.metrics.Counter; -import org.apache.flink.streaming.api.functions.ProcessFunction; -import org.apache.flink.table.data.RowData; -import org.apache.flink.util.Collector; -import org.apache.iceberg.BaseCombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.actions.RewriteFileGroup; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.maintenance.operator.DataFileRewritePlanner.PlannedGroup; -import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.FileScanTaskReader; -import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.util.PropertyUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Executes a rewrite for a single {@link PlannedGroup}. Reads the files with the standard {@link - * FileScanTaskReader}, so the delete files are considered, and writes using the {@link - * TaskWriterFactory}. The output is an {@link ExecutedGroup}. 
- */ -@Internal -public class DataFileRewriteRunner - extends ProcessFunction { - private static final Logger LOG = LoggerFactory.getLogger(DataFileRewriteRunner.class); - - private final String tableName; - private final String taskName; - private final int taskIndex; - - private transient int subTaskId; - private transient int attemptId; - private transient Counter errorCounter; - - public DataFileRewriteRunner(String tableName, String taskName, int taskIndex) { - Preconditions.checkNotNull(tableName, "Table name should no be null"); - Preconditions.checkNotNull(taskName, "Task name should no be null"); - this.tableName = tableName; - this.taskName = taskName; - this.taskIndex = taskIndex; - } - - @Override - public void open(Configuration parameters) { - this.errorCounter = - TableMaintenanceMetrics.groupFor(getRuntimeContext(), tableName, taskName, taskIndex) - .counter(TableMaintenanceMetrics.ERROR_COUNTER); - - this.subTaskId = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); - this.attemptId = getRuntimeContext().getTaskInfo().getAttemptNumber(); - } - - @Override - public void processElement(PlannedGroup value, Context ctx, Collector out) - throws Exception { - if (LOG.isDebugEnabled()) { - LOG.debug( - DataFileRewritePlanner.MESSAGE_PREFIX + "Rewriting files for group {} with files: {}", - tableName, - taskName, - taskIndex, - ctx.timestamp(), - value.group().info(), - value.group().rewrittenFiles()); - } else { - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX - + "Rewriting files for group {} with {} number of files", - tableName, - taskName, - taskIndex, - ctx.timestamp(), - value.group().info(), - value.group().rewrittenFiles().size()); - } - - try (TaskWriter writer = writerFor(value)) { - try (DataIterator iterator = readerFor(value)) { - while (iterator.hasNext()) { - writer.write(iterator.next()); - } - - Set dataFiles = Sets.newHashSet(writer.dataFiles()); - value.group().setOutputFiles(dataFiles); - out.collect( - new ExecutedGroup( - value.table().currentSnapshot().snapshotId(), - value.groupsPerCommit(), - value.group())); - if (LOG.isDebugEnabled()) { - LOG.debug( - DataFileRewritePlanner.MESSAGE_PREFIX + "Rewritten files {} from {} to {}", - tableName, - taskName, - taskIndex, - ctx.timestamp(), - value.group().info(), - value.group().rewrittenFiles(), - value.group().addedFiles()); - } else { - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX + "Rewritten {} files to {} files", - tableName, - taskName, - taskIndex, - ctx.timestamp(), - value.group().rewrittenFiles().size(), - value.group().addedFiles().size()); - } - } catch (Exception ex) { - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX + "Exception rewriting datafile group {}", - tableName, - taskName, - taskIndex, - ctx.timestamp(), - value.group(), - ex); - ctx.output(TaskResultAggregator.ERROR_STREAM, ex); - errorCounter.inc(); - abort(writer, ctx.timestamp()); - } - } catch (Exception ex) { - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX - + "Exception creating compaction writer for group {}", - tableName, - taskName, - taskIndex, - ctx.timestamp(), - value.group(), - ex); - ctx.output(TaskResultAggregator.ERROR_STREAM, ex); - errorCounter.inc(); - } - } - - private TaskWriter writerFor(PlannedGroup value) { - String formatString = - PropertyUtil.propertyAsString( - value.table().properties(), - TableProperties.DEFAULT_FILE_FORMAT, - TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); - RowDataTaskWriterFactory factory = - new RowDataTaskWriterFactory( - value.table(), - 
FlinkSchemaUtil.convert(value.table().schema()), - value.group().inputSplitSize(), - FileFormat.fromString(formatString), - value.table().properties(), - null, - false); - factory.initialize(subTaskId, attemptId); - return factory.create(); - } - - private DataIterator readerFor(PlannedGroup value) { - RowDataFileScanTaskReader reader = - new RowDataFileScanTaskReader( - value.table().schema(), - value.table().schema(), - PropertyUtil.propertyAsString(value.table().properties(), DEFAULT_NAME_MAPPING, null), - false, - Collections.emptyList()); - return new DataIterator<>( - reader, - new BaseCombinedScanTask(value.group().fileScanTasks()), - value.table().io(), - value.table().encryption()); - } - - private void abort(TaskWriter writer, long timestamp) { - try { - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX - + "Aborting rewrite for (subTaskId {}, attemptId {})", - tableName, - taskName, - taskIndex, - timestamp, - subTaskId, - attemptId); - writer.abort(); - } catch (Exception e) { - LOG.info( - DataFileRewritePlanner.MESSAGE_PREFIX + "Exception in abort", - tableName, - taskName, - taskIndex, - timestamp, - e); - } - } - - public static class ExecutedGroup { - private final long snapshotId; - private final int groupsPerCommit; - private final RewriteFileGroup group; - - @VisibleForTesting - ExecutedGroup(long snapshotId, int groupsPerCommit, RewriteFileGroup group) { - this.snapshotId = snapshotId; - this.groupsPerCommit = groupsPerCommit; - this.group = group; - } - - long snapshotId() { - return snapshotId; - } - - int groupsPerCommit() { - return groupsPerCommit; - } - - RewriteFileGroup group() { - return group; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java deleted file mode 100644 index 9189f5f018a8..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DeleteFilesProcessor.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.util.Set; -import org.apache.flink.annotation.Internal; -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.iceberg.Table; -import org.apache.iceberg.io.BulkDeletionFailureException; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.SupportsBulkOperations; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Delete the files using the {@link FileIO} which implements {@link SupportsBulkOperations}. */ -@Internal -public class DeleteFilesProcessor extends AbstractStreamOperator - implements OneInputStreamOperator { - private static final Logger LOG = LoggerFactory.getLogger(DeleteFilesProcessor.class); - - private final String tableName; - private final String taskName; - private final int taskIndex; - private final SupportsBulkOperations io; - private final Set filesToDelete = Sets.newHashSet(); - private final int batchSize; - - private transient Counter failedCounter; - private transient Counter succeededCounter; - - public DeleteFilesProcessor(Table table, String taskName, int taskIndex, int batchSize) { - Preconditions.checkNotNull(taskName, "Task name should no be null"); - Preconditions.checkNotNull(table, "Table should no be null"); - - FileIO fileIO = table.io(); - Preconditions.checkArgument( - fileIO instanceof SupportsBulkOperations, - "%s doesn't support bulk delete", - fileIO.getClass().getSimpleName()); - - this.tableName = table.name(); - this.taskName = taskName; - this.taskIndex = taskIndex; - this.io = (SupportsBulkOperations) fileIO; - this.batchSize = batchSize; - } - - @Override - public void open() throws Exception { - MetricGroup taskMetricGroup = - TableMaintenanceMetrics.groupFor(getRuntimeContext(), tableName, taskName, taskIndex); - this.failedCounter = - taskMetricGroup.counter(TableMaintenanceMetrics.DELETE_FILE_FAILED_COUNTER); - this.succeededCounter = - taskMetricGroup.counter(TableMaintenanceMetrics.DELETE_FILE_SUCCEEDED_COUNTER); - } - - @Override - public void processElement(StreamRecord element) throws Exception { - if (element.isRecord()) { - filesToDelete.add(element.getValue()); - } - - if (filesToDelete.size() >= batchSize) { - deleteFiles(); - } - } - - @Override - public void processWatermark(Watermark mark) { - deleteFiles(); - } - - @Override - public void prepareSnapshotPreBarrier(long checkpointId) { - deleteFiles(); - } - - private void deleteFiles() { - try { - io.deleteFiles(filesToDelete); - LOG.info( - "Deleted {} files from table {} using bulk deletes", filesToDelete.size(), tableName); - succeededCounter.inc(filesToDelete.size()); - filesToDelete.clear(); - } catch (BulkDeletionFailureException e) { - int deletedFilesCount = filesToDelete.size() - e.numberFailedObjects(); - LOG.warn( - "Deleted only {} of {} files from table {} using bulk deletes", - deletedFilesCount, - filesToDelete.size(), - tableName, - e); - succeededCounter.inc(deletedFilesCount); - failedCounter.inc(e.numberFailedObjects()); - } - } -} diff --git 
a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java deleted file mode 100644 index 154512e27ba7..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ExpireSnapshotsProcessor.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.util.Collections; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeinfo.Types; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.functions.ProcessFunction; -import org.apache.flink.util.Collector; -import org.apache.flink.util.OutputTag; -import org.apache.iceberg.ExpireSnapshots; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.api.TaskResult; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Calls the {@link ExpireSnapshots} to remove the old snapshots and emits the filenames which could - * be removed in the {@link #DELETE_STREAM} side output. 
- */ -@Internal -public class ExpireSnapshotsProcessor extends ProcessFunction { - private static final Logger LOG = LoggerFactory.getLogger(ExpireSnapshotsProcessor.class); - public static final OutputTag DELETE_STREAM = - new OutputTag<>("expire-snapshots-file-deletes-stream", Types.STRING); - - private final TableLoader tableLoader; - private final Long maxSnapshotAgeMs; - private final Integer numSnapshots; - private final Integer plannerPoolSize; - private final Boolean cleanExpiredMetadata; - private transient ExecutorService plannerPool; - private transient Table table; - - public ExpireSnapshotsProcessor( - TableLoader tableLoader, - Long maxSnapshotAgeMs, - Integer numSnapshots, - Integer plannerPoolSize, - Boolean cleanExpiredMetadata) { - Preconditions.checkNotNull(tableLoader, "Table loader should not be null"); - - this.tableLoader = tableLoader; - this.maxSnapshotAgeMs = maxSnapshotAgeMs; - this.numSnapshots = numSnapshots; - this.plannerPoolSize = plannerPoolSize; - this.cleanExpiredMetadata = cleanExpiredMetadata; - } - - @Override - public void open(Configuration parameters) throws Exception { - tableLoader.open(); - this.table = tableLoader.loadTable(); - this.plannerPool = - plannerPoolSize != null - ? ThreadPools.newFixedThreadPool(table.name() + "-table--planner", plannerPoolSize) - : ThreadPools.getWorkerPool(); - } - - @Override - public void processElement(Trigger trigger, Context ctx, Collector out) - throws Exception { - try { - table.refresh(); - ExpireSnapshots expireSnapshots = table.expireSnapshots(); - if (maxSnapshotAgeMs != null) { - expireSnapshots = expireSnapshots.expireOlderThan(ctx.timestamp() - maxSnapshotAgeMs); - } - - if (numSnapshots != null) { - expireSnapshots = expireSnapshots.retainLast(numSnapshots); - } - - if (cleanExpiredMetadata != null) { - expireSnapshots.cleanExpiredMetadata(cleanExpiredMetadata); - } - - AtomicLong deleteFileCounter = new AtomicLong(0L); - expireSnapshots - .planWith(plannerPool) - .deleteWith( - file -> { - ctx.output(DELETE_STREAM, file); - deleteFileCounter.incrementAndGet(); - }) - .cleanExpiredFiles(true) - .commit(); - - LOG.info( - "Successfully finished expiring snapshots for {} at {}. Scheduled {} files for delete.", - table, - ctx.timestamp(), - deleteFileCounter.get()); - out.collect( - new TaskResult(trigger.taskId(), trigger.timestamp(), true, Collections.emptyList())); - } catch (Exception e) { - LOG.error("Failed to expiring snapshots for {} at {}", table, ctx.timestamp(), e); - out.collect( - new TaskResult(trigger.taskId(), trigger.timestamp(), false, Lists.newArrayList(e))); - } - } - - @Override - public void close() throws Exception { - super.close(); - - tableLoader.close(); - if (plannerPoolSize != null) { - plannerPool.shutdown(); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java deleted file mode 100644 index 98610346aa18..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileNameReader.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.data.RowData; -import org.apache.flink.util.Collector; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.source.ScanContext; - -/** A specialized reader implementation that extracts file names from Iceberg table rows. */ -@Internal -public class FileNameReader extends TableReader { - - public FileNameReader( - String taskName, - int taskIndex, - TableLoader tableLoader, - Schema projectedSchema, - ScanContext scanContext, - MetadataTableType metadataTableType) { - super(taskName, taskIndex, tableLoader, projectedSchema, scanContext, metadataTableType); - } - - @Override - void extract(RowData rowData, Collector out) { - if (rowData != null && rowData.getString(0) != null) { - out.collect(rowData.getString(0).toString()); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java deleted file mode 100644 index 0ccf6a6ff08a..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/FileUriKeySelector.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.util.Map; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.actions.FileURI; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A key selector implementation that extracts a normalized file path from a file URI string. - * - *

    This selector groups file URIs by their normalized path, ignoring differences in scheme and - * authority that are considered equivalent according to the provided mappings. - */ -@Internal -public class FileUriKeySelector implements KeySelector { - private static final Logger LOG = LoggerFactory.getLogger(FileUriKeySelector.class); - - static final String INVALID_URI = "__INVALID_URI__"; - - private final Map equalSchemes; - private final Map equalAuthorities; - - public FileUriKeySelector( - Map equalSchemes, Map equalAuthorities) { - this.equalSchemes = equalSchemes; - this.equalAuthorities = equalAuthorities; - } - - @Override - public String getKey(String value) throws Exception { - try { - FileURI fileUri = new FileURI(new Path(value).toUri(), equalSchemes, equalAuthorities); - return fileUri.getPath(); - } catch (Exception e) { - LOG.warn("Uri convert to FileURI error! Uri is {}.", value, e); - return INVALID_URI; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java deleted file mode 100644 index 1db95be8d3b6..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListFileSystemFiles.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.util.Map; -import java.util.function.Predicate; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.functions.OpenContext; -import org.apache.flink.metrics.Counter; -import org.apache.flink.streaming.api.functions.ProcessFunction; -import org.apache.flink.util.Collector; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.FileInfo; -import org.apache.iceberg.io.SupportsPrefixOperations; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.FileSystemWalker; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Recursively lists the files in the `location` directory. Hidden files, and files younger than the - * `minAgeMs` are omitted in the result. 
- */ -@Internal -public class ListFileSystemFiles extends ProcessFunction { - private static final Logger LOG = LoggerFactory.getLogger(ListFileSystemFiles.class); - - private final String taskName; - private final int taskIndex; - - private FileIO io; - private Map specs; - private String location; - private final long minAgeMs; - private transient Counter errorCounter; - private final TableLoader tableLoader; - private final boolean usePrefixListing; - private transient Configuration configuration; - - public ListFileSystemFiles( - String taskName, - int taskIndex, - TableLoader tableLoader, - String location, - long minAgeMs, - boolean usePrefixListing) { - Preconditions.checkNotNull(taskName, "Task name should no be null"); - Preconditions.checkNotNull(tableLoader, "TableLoad should no be null"); - - this.tableLoader = tableLoader; - this.taskName = taskName; - this.taskIndex = taskIndex; - this.minAgeMs = minAgeMs; - this.location = location; - this.usePrefixListing = usePrefixListing; - } - - @Override - public void open(OpenContext openContext) throws Exception { - super.open(openContext); - tableLoader.open(); - Table table = tableLoader.loadTable(); - this.io = table.io(); - this.location = location != null ? location : table.location(); - this.specs = table.specs(); - this.errorCounter = - TableMaintenanceMetrics.groupFor(getRuntimeContext(), table.name(), taskName, taskIndex) - .counter(TableMaintenanceMetrics.ERROR_COUNTER); - this.configuration = new Configuration(); - table.properties().forEach(configuration::set); - } - - @Override - public void processElement(Trigger trigger, Context ctx, Collector out) throws Exception { - long olderThanTimestamp = trigger.timestamp() - minAgeMs; - try { - if (usePrefixListing) { - Predicate predicate = fileInfo -> fileInfo.createdAtMillis() < olderThanTimestamp; - Preconditions.checkArgument( - io instanceof SupportsPrefixOperations, - "Cannot use prefix listing with FileIO {} which does not support prefix operations.", - io); - - FileSystemWalker.listDirRecursivelyWithFileIO( - (SupportsPrefixOperations) io, location, specs, predicate, out::collect); - } else { - Predicate predicate = file -> file.getModificationTime() < olderThanTimestamp; - FileSystemWalker.listDirRecursivelyWithHadoop( - location, - specs, - predicate, - configuration, - Integer.MAX_VALUE, - Integer.MAX_VALUE, - dir -> {}, - out::collect); - } - } catch (Exception e) { - LOG.warn("Exception listing files for {} at {}", location, ctx.timestamp(), e); - ctx.output(DeleteOrphanFiles.ERROR_STREAM, e); - errorCounter.inc(); - } - } - - @Override - public void close() throws Exception { - super.close(); - tableLoader.close(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java deleted file mode 100644 index 3ae42c60831c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/ListMetadataFiles.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.functions.OpenContext; -import org.apache.flink.metrics.Counter; -import org.apache.flink.streaming.api.functions.ProcessFunction; -import org.apache.flink.util.Collector; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ReachableFileUtil; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Lists the metadata files referenced by the table. */ -@Internal -public class ListMetadataFiles extends ProcessFunction { - private static final Logger LOG = LoggerFactory.getLogger(ListMetadataFiles.class); - - private final String taskName; - private final int taskIndex; - private transient Counter errorCounter; - private final TableLoader tableLoader; - private transient Table table; - - public ListMetadataFiles(String taskName, int taskIndex, TableLoader tableLoader) { - Preconditions.checkNotNull(taskName, "Task name should no be null"); - Preconditions.checkNotNull(tableLoader, "TableLoader should no be null"); - this.tableLoader = tableLoader; - this.taskName = taskName; - this.taskIndex = taskIndex; - } - - @Override - public void open(OpenContext openContext) throws Exception { - super.open(openContext); - tableLoader.open(); - this.table = tableLoader.loadTable(); - this.errorCounter = - TableMaintenanceMetrics.groupFor(getRuntimeContext(), table.name(), taskName, taskIndex) - .counter(TableMaintenanceMetrics.ERROR_COUNTER); - } - - @Override - public void processElement(Trigger trigger, Context ctx, Collector collector) - throws Exception { - try { - table - .snapshots() - .forEach( - snapshot -> { - // Manifest lists - collector.collect(snapshot.manifestListLocation()); - // Snapshot JSONs - ReachableFileUtil.metadataFileLocations(table, false).forEach(collector::collect); - // Statistics files - ReachableFileUtil.statisticsFilesLocations(table).forEach(collector::collect); - // Version hint file for Hadoop catalogs - collector.collect(ReachableFileUtil.versionHintLocation(table)); - - // Emit the manifest file locations - snapshot.allManifests(table.io()).stream() - .map(ManifestFile::path) - .forEach(collector::collect); - }); - } catch (Exception e) { - LOG.error("Exception listing metadata files for {} at {}", table, ctx.timestamp(), e); - ctx.output(DeleteOrphanFiles.ERROR_STREAM, e); - errorCounter.inc(); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java deleted file mode 100644 index ea91f13376a5..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockFactoryBuilder.java +++ /dev/null @@ -1,87 
+0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.flink.maintenance.api.JdbcLockFactory; -import org.apache.iceberg.flink.maintenance.api.LockConfig; -import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; -import org.apache.iceberg.flink.maintenance.api.ZkLockFactory; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -@Internal -public class LockFactoryBuilder { - - private LockFactoryBuilder() {} - - public static TriggerLockFactory build(LockConfig lockConfig, String tableName) { - - String lockType = lockConfig.lockType(); - - Preconditions.checkArgument( - StringUtils.isNotEmpty(lockType), - "Configuration must contain key: %s", - LockConfig.LOCK_TYPE_OPTION.key()); - - // Set lock id to catalog.db.table if not set - switch (lockType) { - case LockConfig.JdbcLockConfig.JDBC: - return createJdbcLockFactory(lockConfig, tableName); - - case LockConfig.ZkLockConfig.ZK: - return createZkLockFactory(lockConfig, tableName); - - default: - throw new IllegalArgumentException(String.format("Unsupported lock type: %s ", lockType)); - } - } - - private static TriggerLockFactory createJdbcLockFactory(LockConfig lockConfig, String tableName) { - String jdbcUri = lockConfig.jdbcUri(); - String lockId = lockConfig.lockId(tableName); - Map properties = lockConfig.properties(); - Preconditions.checkArgument( - StringUtils.isNotEmpty(jdbcUri), - "JDBC lock requires %s parameter", - LockConfig.JdbcLockConfig.JDBC_URI_OPTION.key()); - - properties.put(JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY, lockConfig.jdbcInitTable()); - - return new JdbcLockFactory(jdbcUri, lockId, properties); - } - - private static TriggerLockFactory createZkLockFactory(LockConfig lockConfig, String tableName) { - String zkUri = lockConfig.zkUri(); - String lockId = lockConfig.lockId(tableName); - Preconditions.checkArgument( - StringUtils.isNotEmpty(zkUri), - "Zk lock requires %s parameter", - LockConfig.ZkLockConfig.ZK_URI_OPTION.key()); - - return new ZkLockFactory( - zkUri, - lockId, - lockConfig.zkSessionTimeoutMs(), - lockConfig.zkConnectionTimeoutMs(), - lockConfig.zkBaseSleepMs(), - lockConfig.zkMaxRetries()); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java deleted file mode 100644 index 2066ca8e010e..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LockRemover.java +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Licensed to 
the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.util.List; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.annotation.Internal; -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.iceberg.flink.maintenance.api.TaskResult; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Manages locks and collect {@link org.apache.flink.metrics.Metric} for the Maintenance Tasks. - * - *

<p>The assumptions about the locks are the following: - * - * <ul> - *
  <li>Every {@link TaskResult} is followed by a {@link Watermark} for normal {@link Trigger}s - *
  <li>For the {@link Trigger#recovery(long)} {@link Watermark} there is no element to process - *
</ul> - * - * When processing the inputs there are 3 possibilities: - * - * <ul> - *
  <li>Normal execution - we receive a {@link TaskResult} and then a {@link Watermark} - unlocking - * the lock is handled by the {@link #processElement(StreamRecord)} - *
  <li>Recovery without ongoing execution (unlocking the recoveryLock) - we receive the {@link - * Trigger#recovery(long)} {@link Watermark} without any {@link TaskResult} - unlocking the - * {@link TriggerLockFactory#createRecoveryLock()} and a possible {@link - * TriggerLockFactory#createLock()} is handled by the {@link #processWatermark(Watermark)} - * (the {@link #lastProcessedTaskStartEpoch} is 0 in this case) - *
  <li>Recovery with an ongoing execution - we receive a {@link TaskResult} and then a {@link - * Watermark} - unlocking the {@link TriggerLockFactory#createLock()} is handled by the {@link - * #processElement(StreamRecord)}, unlocking the {@link - * TriggerLockFactory#createRecoveryLock()} is handled by the {@link - * #processWatermark(Watermark)} (the {@link #lastProcessedTaskStartEpoch} is the start time - * of the old task) - *
</ul>
    - */ -@Internal -public class LockRemover extends AbstractStreamOperator - implements OneInputStreamOperator { - private static final Logger LOG = LoggerFactory.getLogger(LockRemover.class); - - private final String tableName; - private final TriggerLockFactory lockFactory; - private final List maintenanceTaskNames; - - private transient List succeededTaskResultCounters; - private transient List failedTaskResultCounters; - private transient List taskLastRunDurationMs; - private transient TriggerLockFactory.Lock lock; - private transient TriggerLockFactory.Lock recoveryLock; - private transient long lastProcessedTaskStartEpoch = 0L; - - public LockRemover( - String tableName, TriggerLockFactory lockFactory, List maintenanceTaskNames) { - Preconditions.checkNotNull(lockFactory, "Lock factory should no be null"); - Preconditions.checkArgument( - maintenanceTaskNames != null && !maintenanceTaskNames.isEmpty(), - "Invalid maintenance task names: null or empty"); - - this.tableName = tableName; - this.lockFactory = lockFactory; - this.maintenanceTaskNames = maintenanceTaskNames; - } - - @Override - public void open() throws Exception { - super.open(); - this.succeededTaskResultCounters = - Lists.newArrayListWithExpectedSize(maintenanceTaskNames.size()); - this.failedTaskResultCounters = Lists.newArrayListWithExpectedSize(maintenanceTaskNames.size()); - this.taskLastRunDurationMs = Lists.newArrayListWithExpectedSize(maintenanceTaskNames.size()); - for (int taskIndex = 0; taskIndex < maintenanceTaskNames.size(); ++taskIndex) { - MetricGroup taskMetricGroup = - TableMaintenanceMetrics.groupFor( - getRuntimeContext(), tableName, maintenanceTaskNames.get(taskIndex), taskIndex); - succeededTaskResultCounters.add( - taskMetricGroup.counter(TableMaintenanceMetrics.SUCCEEDED_TASK_COUNTER)); - failedTaskResultCounters.add( - taskMetricGroup.counter(TableMaintenanceMetrics.FAILED_TASK_COUNTER)); - AtomicLong duration = new AtomicLong(0); - taskLastRunDurationMs.add(duration); - taskMetricGroup.gauge(TableMaintenanceMetrics.LAST_RUN_DURATION_MS, duration::get); - } - - lockFactory.open(); - this.lock = lockFactory.createLock(); - this.recoveryLock = lockFactory.createRecoveryLock(); - } - - @Override - public void processElement(StreamRecord streamRecord) { - TaskResult taskResult = streamRecord.getValue(); - LOG.info( - "Processing result {} for task {}", - taskResult, - maintenanceTaskNames.get(taskResult.taskIndex())); - long duration = System.currentTimeMillis() - taskResult.startEpoch(); - lock.unlock(); - this.lastProcessedTaskStartEpoch = taskResult.startEpoch(); - - // Update the metrics - taskLastRunDurationMs.get(taskResult.taskIndex()).set(duration); - if (taskResult.success()) { - succeededTaskResultCounters.get(taskResult.taskIndex()).inc(); - } else { - failedTaskResultCounters.get(taskResult.taskIndex()).inc(); - } - } - - @Override - public void processWatermark(Watermark mark) { - if (mark.getTimestamp() > lastProcessedTaskStartEpoch) { - lock.unlock(); - recoveryLock.unlock(); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java deleted file mode 100644 index 8bdcd7ba2b57..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/LogUtil.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -class LogUtil { - static final String MESSAGE_PREFIX = "[For table {} with {}[{}] at {}]: "; - static final String MESSAGE_FORMAT_PREFIX = "[For table %s with {%s}[{%d}] at {%d}]: "; - - private LogUtil() {} -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java deleted file mode 100644 index c03b3be1a977..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MetadataTablePlanner.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.util.concurrent.ExecutorService; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.functions.OpenContext; -import org.apache.flink.metrics.Counter; -import org.apache.flink.streaming.api.functions.ProcessFunction; -import org.apache.flink.util.Collector; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.MetadataTableUtils; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.flink.source.FlinkSplitPlanner; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Plans the splits to read a metadata table content. 
*/ -@Internal -public class MetadataTablePlanner extends ProcessFunction { - private static final Logger LOG = LoggerFactory.getLogger(MetadataTablePlanner.class); - - private final String taskName; - private final int taskIndex; - private final TableLoader tableLoader; - private final int workerPoolSize; - private final ScanContext scanContext; - private transient ExecutorService workerPool; - private transient Counter errorCounter; - private transient Table table; - private transient IcebergSourceSplitSerializer splitSerializer; - private final MetadataTableType metadataTableType; - - public MetadataTablePlanner( - String taskName, - int taskIndex, - TableLoader tableLoader, - ScanContext scanContext, - MetadataTableType metadataTableType, - int workerPoolSize) { - Preconditions.checkNotNull(taskName, "Task name should no be null"); - Preconditions.checkNotNull(tableLoader, "Table should no be null"); - Preconditions.checkArgument(scanContext.isStreaming(), "Streaming should be set to true"); - - this.taskName = taskName; - this.taskIndex = taskIndex; - this.tableLoader = tableLoader; - this.scanContext = scanContext; - this.workerPoolSize = workerPoolSize; - this.metadataTableType = metadataTableType; - } - - @Override - public void open(OpenContext openContext) throws Exception { - tableLoader.open(); - Table originalTable = tableLoader.loadTable(); - this.table = MetadataTableUtils.createMetadataTableInstance(originalTable, metadataTableType); - this.workerPool = - ThreadPools.newFixedThreadPool(table.name() + "-table-planner", workerPoolSize); - this.splitSerializer = new IcebergSourceSplitSerializer(scanContext.caseSensitive()); - this.errorCounter = - TableMaintenanceMetrics.groupFor( - getRuntimeContext(), originalTable.name(), taskName, taskIndex) - .counter(TableMaintenanceMetrics.ERROR_COUNTER); - } - - @Override - public void processElement(Trigger trigger, Context ctx, Collector out) - throws Exception { - try { - table.refresh(); - for (IcebergSourceSplit split : - FlinkSplitPlanner.planIcebergSourceSplits(table, scanContext, workerPool)) { - out.collect(new SplitInfo(splitSerializer.getVersion(), splitSerializer.serialize(split))); - } - } catch (Exception e) { - LOG.warn("Exception planning scan for {} at {}", table, ctx.timestamp(), e); - ctx.output(DeleteOrphanFiles.ERROR_STREAM, e); - errorCounter.inc(); - } - } - - @Override - public void close() throws Exception { - super.close(); - tableLoader.close(); - if (workerPool != null) { - workerPool.shutdown(); - } - } - - public static class SplitInfo { - private final int version; - private final byte[] split; - - public SplitInfo(int version, byte[] split) { - this.version = version; - this.split = split; - } - - public int version() { - return version; - } - - public byte[] split() { - return split; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java deleted file mode 100644 index d74b2349b1de..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.io.IOException; -import java.util.Iterator; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.api.connector.source.SourceReader; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.api.connector.source.util.ratelimit.RateLimitedSourceReader; -import org.apache.flink.api.connector.source.util.ratelimit.RateLimiter; -import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.iceberg.DataOperations; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Monitors an Iceberg table for changes */ -@Internal -public class MonitorSource extends SingleThreadedIteratorSource { - private static final Logger LOG = LoggerFactory.getLogger(MonitorSource.class); - - private final TableLoader tableLoader; - private final RateLimiterStrategy rateLimiterStrategy; - private final long maxReadBack; - - /** - * Creates a {@link org.apache.flink.api.connector.source.Source} which monitors an Iceberg table - * for changes. 
- * - * @param tableLoader used for accessing the table - * @param rateLimiterStrategy limits the frequency the table is checked - * @param maxReadBack sets the number of snapshots read before stopping change collection - */ - public MonitorSource( - TableLoader tableLoader, RateLimiterStrategy rateLimiterStrategy, long maxReadBack) { - Preconditions.checkNotNull(tableLoader, "Table loader should no be null"); - Preconditions.checkNotNull(rateLimiterStrategy, "Rate limiter strategy should no be null"); - Preconditions.checkArgument(maxReadBack > 0, "Need to read at least 1 snapshot to work"); - - this.tableLoader = tableLoader; - this.rateLimiterStrategy = rateLimiterStrategy; - this.maxReadBack = maxReadBack; - } - - @Override - public Boundedness getBoundedness() { - return Boundedness.CONTINUOUS_UNBOUNDED; - } - - @Override - public TypeInformation getProducedType() { - return TypeInformation.of(TableChange.class); - } - - @Override - Iterator createIterator() { - return new TableChangeIterator(tableLoader, null, maxReadBack); - } - - @Override - SimpleVersionedSerializer> iteratorSerializer() { - return new TableChangeIteratorSerializer(tableLoader, maxReadBack); - } - - @Override - public SourceReader> createReader( - SourceReaderContext readerContext) throws Exception { - RateLimiter rateLimiter = rateLimiterStrategy.createRateLimiter(1); - return new RateLimitedSourceReader<>(super.createReader(readerContext), rateLimiter); - } - - /** The Iterator which returns the latest changes on an Iceberg table. */ - @VisibleForTesting - static class TableChangeIterator implements Iterator { - private Long lastSnapshotId; - private final long maxReadBack; - private final Table table; - - TableChangeIterator(TableLoader tableLoader, Long lastSnapshotId, long maxReadBack) { - this.lastSnapshotId = lastSnapshotId; - this.maxReadBack = maxReadBack; - tableLoader.open(); - this.table = tableLoader.loadTable(); - } - - @Override - public boolean hasNext() { - return true; - } - - @Override - public TableChange next() { - try { - table.refresh(); - Snapshot currentSnapshot = table.currentSnapshot(); - Long current = currentSnapshot != null ? 
currentSnapshot.snapshotId() : null; - Long checking = current; - TableChange event = TableChange.empty(); - long readBack = 0; - while (checking != null && !checking.equals(lastSnapshotId) && ++readBack <= maxReadBack) { - Snapshot snapshot = table.snapshot(checking); - if (snapshot != null) { - if (!DataOperations.REPLACE.equals(snapshot.operation())) { - LOG.debug("Reading snapshot {}", snapshot.snapshotId()); - event.merge(new TableChange(snapshot, table.io())); - } else { - LOG.debug("Skipping replace snapshot {}", snapshot.snapshotId()); - } - - checking = snapshot.parentId(); - } else { - // If the last snapshot has been removed from the history - checking = null; - } - } - - lastSnapshotId = current; - return event; - } catch (Exception e) { - LOG.warn("Failed to fetch table changes for {}", table, e); - return TableChange.empty(); - } - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("lastSnapshotId", lastSnapshotId) - .add("maxReadBack", maxReadBack) - .add("table", table) - .toString(); - } - } - - private static final class TableChangeIteratorSerializer - implements SimpleVersionedSerializer> { - - private static final int CURRENT_VERSION = 1; - private final TableLoader tableLoader; - private final long maxReadBack; - - TableChangeIteratorSerializer(TableLoader tableLoader, long maxReadBack) { - this.tableLoader = tableLoader; - this.maxReadBack = maxReadBack; - } - - @Override - public int getVersion() { - return CURRENT_VERSION; - } - - @Override - public byte[] serialize(Iterator iterator) throws IOException { - Preconditions.checkArgument( - iterator instanceof TableChangeIterator, - "Use TableChangeIterator iterator. Found incompatible type: %s", - iterator.getClass()); - - TableChangeIterator tableChangeIterator = (TableChangeIterator) iterator; - DataOutputSerializer out = new DataOutputSerializer(8); - long toStore = - tableChangeIterator.lastSnapshotId != null ? tableChangeIterator.lastSnapshotId : -1L; - out.writeLong(toStore); - return out.getCopyOfBuffer(); - } - - @Override - public TableChangeIterator deserialize(int version, byte[] serialized) throws IOException { - if (version == CURRENT_VERSION) { - DataInputDeserializer in = new DataInputDeserializer(serialized); - long fromStore = in.readLong(); - return new TableChangeIterator( - tableLoader, fromStore != -1 ? fromStore : null, maxReadBack); - } else { - throw new IOException("Unrecognized version or corrupt state: " + version); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java deleted file mode 100644 index 5c602f4f1e54..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/OrphanFilesDetector.java +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.functions.OpenContext; -import org.apache.flink.api.common.state.MapState; -import org.apache.flink.api.common.state.MapStateDescriptor; -import org.apache.flink.api.common.state.ValueState; -import org.apache.flink.api.common.state.ValueStateDescriptor; -import org.apache.flink.api.common.typeinfo.Types; -import org.apache.flink.streaming.api.functions.co.KeyedCoProcessFunction; -import org.apache.flink.util.Collector; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.actions.DeleteOrphanFiles; -import org.apache.iceberg.actions.FileURI; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A specialized co-process function that performs an anti-join between two streams of file URIs. - * - *

    Emits every file that exists in the file system but is not referenced in the table metadata, - * which are considered orphan files. It also handles URI normalization using provided scheme and - * authority equivalence mappings. - */ -@Internal -public class OrphanFilesDetector extends KeyedCoProcessFunction { - private static final Logger LOG = LoggerFactory.getLogger(OrphanFilesDetector.class); - - // Use MapState to dedupe the strings found in the table - private transient MapState foundInTable; - private transient ValueState foundInFileSystem; - private transient ValueState hasUriError; - private final DeleteOrphanFiles.PrefixMismatchMode prefixMismatchMode; - private final Map equalSchemes; - private final Map equalAuthorities; - - public OrphanFilesDetector( - DeleteOrphanFiles.PrefixMismatchMode prefixMismatchMode, - Map equalSchemes, - Map equalAuthorities) { - this.prefixMismatchMode = prefixMismatchMode; - this.equalSchemes = equalSchemes; - this.equalAuthorities = equalAuthorities; - } - - @Override - public void open(OpenContext openContext) throws Exception { - super.open(openContext); - foundInTable = - getRuntimeContext() - .getMapState( - new MapStateDescriptor<>("antiJoinFoundInTable", Types.STRING, Types.BOOLEAN)); - hasUriError = - getRuntimeContext().getState(new ValueStateDescriptor<>("antiJoinUriError", Types.BOOLEAN)); - foundInFileSystem = - getRuntimeContext() - .getState(new ValueStateDescriptor<>("antiJoinFoundInFileSystem", Types.STRING)); - } - - @Override - public void processElement1(String value, Context context, Collector collector) - throws Exception { - if (shouldSkipElement(value, context)) { - return; - } - - if (!foundInTable.contains(value)) { - foundInTable.put(value, true); - context.timerService().registerEventTimeTimer(context.timestamp()); - } - } - - @Override - public void processElement2(String value, Context context, Collector collector) - throws Exception { - if (shouldSkipElement(value, context)) { - return; - } - - foundInFileSystem.update(value); - context.timerService().registerEventTimeTimer(context.timestamp()); - } - - @Override - public void onTimer(long timestamp, OnTimerContext ctx, Collector out) throws Exception { - if (Boolean.TRUE.equals(hasUriError.value())) { - clearState(); - return; - } - - List foundInTablesList = Lists.newArrayList(); - foundInTable - .keys() - .forEach( - uri -> - foundInTablesList.add( - new FileURI(new Path(uri).toUri(), equalSchemes, equalAuthorities))); - - if (foundInFileSystem.value() != null) { - if (foundInTablesList.isEmpty()) { - FileURI fileURI = - new FileURI( - new Path(foundInFileSystem.value()).toUri(), equalSchemes, equalAuthorities); - out.collect(fileURI.getUriAsString()); - } else { - FileURI actual = - new FileURI( - new Path(foundInFileSystem.value()).toUri(), equalSchemes, equalAuthorities); - if (hasMismatch(actual, foundInTablesList)) { - if (prefixMismatchMode == DeleteOrphanFiles.PrefixMismatchMode.DELETE) { - out.collect(foundInFileSystem.value()); - } else if (prefixMismatchMode == DeleteOrphanFiles.PrefixMismatchMode.ERROR) { - ValidationException validationException = - new ValidationException( - "Unable to determine whether certain files are orphan. " - + "Metadata references files that match listed/provided files except for authority/scheme. " - + "Please, inspect the conflicting authorities/schemes and provide which of them are equal " - + "by further configuring the action via equalSchemes() and equalAuthorities() methods. 
" - + "Set the prefix mismatch mode to 'NONE' to ignore remaining locations with conflicting " - + "authorities/schemes or to 'DELETE' if you are ABSOLUTELY confident that remaining conflicting " - + "authorities/schemes are different. It will be impossible to recover deleted files. " - + "Conflicting authorities/schemes"); - LOG.warn( - "Unable to determine whether certain files are orphan. Found in filesystem: {} and in table: {}", - actual, - StringUtils.join(foundInTablesList, ","), - validationException); - ctx.output( - org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.ERROR_STREAM, - validationException); - } - } - } - } - - clearState(); - } - - private boolean hasMismatch(FileURI actual, List foundInTablesList) { - return foundInTablesList.stream() - .noneMatch(valid -> valid.schemeMatch(actual) && valid.authorityMatch(actual)); - } - - private boolean shouldSkipElement(String value, Context context) throws IOException { - if (Boolean.TRUE.equals(hasUriError.value())) { - return true; - } - - if (FileUriKeySelector.INVALID_URI.equals(context.getCurrentKey())) { - context.output( - org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.ERROR_STREAM, - new RuntimeException("Invalid URI format detected: " + value)); - hasUriError.update(true); - foundInTable.clear(); - foundInFileSystem.clear(); - return true; - } - - return false; - } - - private void clearState() { - hasUriError.clear(); - foundInTable.clear(); - foundInFileSystem.clear(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java deleted file mode 100644 index 20c7684d9700..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.io.IOException; -import java.util.Collection; -import java.util.Iterator; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.Source; -import org.apache.flink.api.connector.source.SourceReader; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.api.connector.source.SplitEnumerator; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.api.connector.source.lib.util.IteratorSourceEnumerator; -import org.apache.flink.api.connector.source.lib.util.IteratorSourceReader; -import org.apache.flink.api.connector.source.lib.util.IteratorSourceSplit; -import org.apache.flink.api.java.typeutils.ResultTypeQueryable; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -/** - * Implementation of the Source V2 API which uses an iterator to read the elements, and uses a - * single thread to do so. - * - * @param The return type of the source - */ -@Internal -public abstract class SingleThreadedIteratorSource - implements Source< - T, - SingleThreadedIteratorSource.GlobalSplit, - Collection>>, - ResultTypeQueryable { - private static final String PARALLELISM_ERROR = "Parallelism should be set to 1"; - - /** - * Creates the iterator to return the elements which then emitted by the source. - * - * @return iterator for the elements - */ - abstract Iterator createIterator(); - - /** - * Serializes the iterator, which is used to save and restore the state of the source. - * - * @return serializer for the iterator - */ - abstract SimpleVersionedSerializer> iteratorSerializer(); - - @Override - public SplitEnumerator, Collection>> createEnumerator( - SplitEnumeratorContext> enumContext) { - Preconditions.checkArgument(enumContext.currentParallelism() == 1, PARALLELISM_ERROR); - return new IteratorSourceEnumerator<>( - enumContext, ImmutableList.of(new GlobalSplit<>(createIterator()))); - } - - @Override - public SplitEnumerator, Collection>> restoreEnumerator( - SplitEnumeratorContext> enumContext, Collection> checkpoint) { - Preconditions.checkArgument(enumContext.currentParallelism() == 1, PARALLELISM_ERROR); - return new IteratorSourceEnumerator<>(enumContext, checkpoint); - } - - @Override - public SimpleVersionedSerializer> getSplitSerializer() { - return new SplitSerializer<>(iteratorSerializer()); - } - - @Override - public SimpleVersionedSerializer>> getEnumeratorCheckpointSerializer() { - return new EnumeratorSerializer<>(iteratorSerializer()); - } - - @Override - public SourceReader> createReader(SourceReaderContext readerContext) - throws Exception { - Preconditions.checkArgument(readerContext.getIndexOfSubtask() == 0, PARALLELISM_ERROR); - return new IteratorSourceReader<>(readerContext); - } - - /** The single split of the {@link SingleThreadedIteratorSource}. 
*/ - static class GlobalSplit implements IteratorSourceSplit> { - private final Iterator iterator; - - GlobalSplit(Iterator iterator) { - this.iterator = iterator; - } - - @Override - public String splitId() { - return "1"; - } - - @Override - public Iterator getIterator() { - return iterator; - } - - @Override - public IteratorSourceSplit> getUpdatedSplitForIterator( - final Iterator newIterator) { - return new GlobalSplit<>(newIterator); - } - - @Override - public String toString() { - return String.format("GlobalSplit (%s)", iterator); - } - } - - private static final class SplitSerializer - implements SimpleVersionedSerializer> { - private final SimpleVersionedSerializer> iteratorSerializer; - - SplitSerializer(SimpleVersionedSerializer> iteratorSerializer) { - this.iteratorSerializer = iteratorSerializer; - } - - private static final int CURRENT_VERSION = 1; - - @Override - public int getVersion() { - return CURRENT_VERSION; - } - - @Override - public byte[] serialize(GlobalSplit split) throws IOException { - return iteratorSerializer.serialize(split.iterator); - } - - @Override - public GlobalSplit deserialize(int version, byte[] serialized) throws IOException { - return new GlobalSplit<>(iteratorSerializer.deserialize(version, serialized)); - } - } - - private static final class EnumeratorSerializer - implements SimpleVersionedSerializer>> { - private static final int CURRENT_VERSION = 1; - private final SimpleVersionedSerializer> iteratorSerializer; - - EnumeratorSerializer(SimpleVersionedSerializer> iteratorSerializer) { - this.iteratorSerializer = iteratorSerializer; - } - - @Override - public int getVersion() { - return CURRENT_VERSION; - } - - @Override - public byte[] serialize(Collection> checkpoint) throws IOException { - Preconditions.checkArgument(checkpoint.size() < 2, PARALLELISM_ERROR); - if (checkpoint.isEmpty()) { - return new byte[] {0}; - } else { - byte[] iterator = iteratorSerializer.serialize(checkpoint.iterator().next().getIterator()); - byte[] result = new byte[iterator.length + 1]; - result[0] = 1; - System.arraycopy(iterator, 0, result, 1, iterator.length); - return result; - } - } - - @Override - public Collection> deserialize(int version, byte[] serialized) - throws IOException { - if (serialized[0] == 0) { - return Lists.newArrayList(); - } else { - byte[] iterator = new byte[serialized.length - 1]; - System.arraycopy(serialized, 1, iterator, 0, serialized.length - 1); - return Lists.newArrayList( - new GlobalSplit<>(iteratorSerializer.deserialize(version, iterator))); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java deleted file mode 100644 index 8a185ba8a912..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SkipOnError.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.typeinfo.Types; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.TwoInputStreamOperator; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Skip file deletion processing when an error is encountered. */ -@Internal -public class SkipOnError extends AbstractStreamOperator - implements TwoInputStreamOperator { - private static final Logger LOG = LoggerFactory.getLogger(SkipOnError.class); - private transient ListState filesToDelete; - private transient ListState hasError; - private boolean hasErrorFlag = false; - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - super.initializeState(context); - this.filesToDelete = - context - .getOperatorStateStore() - .getListState(new ListStateDescriptor<>("blockOnErrorFiles", String.class)); - this.hasError = - context - .getOperatorStateStore() - .getListState(new ListStateDescriptor<>("blockOnErrorHasError", Types.BOOLEAN)); - - if (!Iterables.isEmpty(hasError.get())) { - hasErrorFlag = true; - } - } - - @Override - public void processElement1(StreamRecord element) throws Exception { - if (!hasErrorFlag) { - filesToDelete.add(element.getValue()); - } - } - - @Override - public void processElement2(StreamRecord element) throws Exception { - hasError.add(true); - hasErrorFlag = true; - filesToDelete.clear(); - } - - @Override - public void processWatermark(Watermark mark) throws Exception { - try { - if (!hasErrorFlag) { - filesToDelete.get().forEach(file -> output.collect(new StreamRecord<>(file))); - } else { - LOG.info("Omitting result on failure at {}", mark.getTimestamp()); - } - } finally { - filesToDelete.clear(); - hasError.clear(); - hasErrorFlag = false; - } - - super.processWatermark(mark); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java deleted file mode 100644 index 87600c52304a..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.util.Objects; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -/** Event describing changes in an Iceberg table */ -@Internal -public class TableChange { - private int dataFileCount; - private long dataFileSizeInBytes; - private int posDeleteFileCount; - private long posDeleteRecordCount; - private int eqDeleteFileCount; - private long eqDeleteRecordCount; - private int commitCount; - - private TableChange( - int dataFileCount, - long dataFileSizeInBytes, - int posDeleteFileCount, - long posDeleteRecordCount, - int eqDeleteFileCount, - long eqDeleteRecordCount, - int commitCount) { - this.dataFileCount = dataFileCount; - this.dataFileSizeInBytes = dataFileSizeInBytes; - this.posDeleteFileCount = posDeleteFileCount; - this.posDeleteRecordCount = posDeleteRecordCount; - this.eqDeleteFileCount = eqDeleteFileCount; - this.eqDeleteRecordCount = eqDeleteRecordCount; - this.commitCount = commitCount; - } - - TableChange(Snapshot snapshot, FileIO io) { - this(snapshot.addedDataFiles(io), snapshot.addedDeleteFiles(io)); - } - - public TableChange(Iterable dataFiles, Iterable deleteFiles) { - dataFiles.forEach( - dataFile -> { - this.dataFileCount++; - this.dataFileSizeInBytes += dataFile.fileSizeInBytes(); - }); - - deleteFiles.forEach( - deleteFile -> { - switch (deleteFile.content()) { - case POSITION_DELETES: - this.posDeleteFileCount++; - this.posDeleteRecordCount += deleteFile.recordCount(); - break; - case EQUALITY_DELETES: - this.eqDeleteFileCount++; - this.eqDeleteRecordCount += deleteFile.recordCount(); - break; - default: - throw new IllegalArgumentException("Unexpected delete file content: " + deleteFile); - } - }); - - this.commitCount = 1; - } - - static TableChange empty() { - return new TableChange(0, 0L, 0, 0L, 0, 0L, 0); - } - - public static Builder builder() { - return new Builder(); - } - - int dataFileCount() { - return dataFileCount; - } - - long dataFileSizeInBytes() { - return dataFileSizeInBytes; - } - - int posDeleteFileCount() { - return posDeleteFileCount; - } - - long posDeleteRecordCount() { - return posDeleteRecordCount; - } - - int eqDeleteFileCount() { - return eqDeleteFileCount; - } - - long eqDeleteRecordCount() { - return eqDeleteRecordCount; - } - - int commitCount() { - return commitCount; - } - - public void merge(TableChange other) { - this.dataFileCount += other.dataFileCount; - this.dataFileSizeInBytes += other.dataFileSizeInBytes; - this.posDeleteFileCount += other.posDeleteFileCount; - this.posDeleteRecordCount += other.posDeleteRecordCount; - this.eqDeleteFileCount += other.eqDeleteFileCount; - this.eqDeleteRecordCount += other.eqDeleteRecordCount; - this.commitCount += other.commitCount; - } - - TableChange copy() { - return new TableChange( - dataFileCount, - dataFileSizeInBytes, - posDeleteFileCount, - posDeleteRecordCount, - 
eqDeleteFileCount, - eqDeleteRecordCount, - commitCount); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("dataFileCount", dataFileCount) - .add("dataFileSizeInBytes", dataFileSizeInBytes) - .add("posDeleteFileCount", posDeleteFileCount) - .add("posDeleteRecordCount", posDeleteRecordCount) - .add("eqDeleteFileCount", eqDeleteFileCount) - .add("eqDeleteRecordCount", eqDeleteRecordCount) - .add("commitCount", commitCount) - .toString(); - } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } else if (other == null || getClass() != other.getClass()) { - return false; - } - - TableChange that = (TableChange) other; - return dataFileCount == that.dataFileCount - && dataFileSizeInBytes == that.dataFileSizeInBytes - && posDeleteFileCount == that.posDeleteFileCount - && posDeleteRecordCount == that.posDeleteRecordCount - && eqDeleteFileCount == that.eqDeleteFileCount - && eqDeleteRecordCount == that.eqDeleteRecordCount - && commitCount == that.commitCount; - } - - @Override - public int hashCode() { - return Objects.hash( - dataFileCount, - dataFileSizeInBytes, - posDeleteFileCount, - posDeleteRecordCount, - eqDeleteFileCount, - eqDeleteRecordCount, - commitCount); - } - - public static class Builder { - private int dataFileCount = 0; - private long dataFileSizeInBytes = 0L; - private int posDeleteFileCount = 0; - private long posDeleteRecordCount = 0L; - private int eqDeleteFileCount = 0; - private long eqDeleteRecordCount = 0L; - private int commitCount = 0; - - private Builder() {} - - public Builder dataFileCount(int newDataFileCount) { - this.dataFileCount = newDataFileCount; - return this; - } - - public Builder dataFileSizeInBytes(long newDataFileSizeInBytes) { - this.dataFileSizeInBytes = newDataFileSizeInBytes; - return this; - } - - public Builder posDeleteFileCount(int newPosDeleteFileCount) { - this.posDeleteFileCount = newPosDeleteFileCount; - return this; - } - - public Builder posDeleteRecordCount(long newPosDeleteRecordCount) { - this.posDeleteRecordCount = newPosDeleteRecordCount; - return this; - } - - public Builder eqDeleteFileCount(int newEqDeleteFileCount) { - this.eqDeleteFileCount = newEqDeleteFileCount; - return this; - } - - public Builder eqDeleteRecordCount(long newEqDeleteRecordCount) { - this.eqDeleteRecordCount = newEqDeleteRecordCount; - return this; - } - - public Builder commitCount(int newCommitCount) { - this.commitCount = newCommitCount; - return this; - } - - public TableChange build() { - return new TableChange( - dataFileCount, - dataFileSizeInBytes, - posDeleteFileCount, - posDeleteRecordCount, - eqDeleteFileCount, - eqDeleteRecordCount, - commitCount); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java deleted file mode 100644 index 897760caaacc..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableMaintenanceMetrics.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import org.apache.flink.api.common.functions.RuntimeContext; -import org.apache.flink.metrics.MetricGroup; - -public class TableMaintenanceMetrics { - public static final String GROUP_KEY = "maintenance"; - public static final String TASK_NAME_KEY = "taskName"; - public static final String TASK_INDEX_KEY = "taskIndex"; - public static final String TABLE_NAME_KEY = "tableName"; - - // Operator error counter - public static final String ERROR_COUNTER = "error"; - - // TriggerManager metrics - public static final String RATE_LIMITER_TRIGGERED = "rateLimiterTriggered"; - public static final String CONCURRENT_RUN_THROTTLED = "concurrentRunThrottled"; - public static final String TRIGGERED = "triggered"; - public static final String NOTHING_TO_TRIGGER = "nothingToTrigger"; - - // LockRemover metrics - public static final String SUCCEEDED_TASK_COUNTER = "succeededTasks"; - public static final String FAILED_TASK_COUNTER = "failedTasks"; - public static final String LAST_RUN_DURATION_MS = "lastRunDurationMs"; - - // DeleteFiles metrics - public static final String DELETE_FILE_FAILED_COUNTER = "deleteFailed"; - public static final String DELETE_FILE_SUCCEEDED_COUNTER = "deleteSucceeded"; - - // DataFileUpdater metrics - public static final String ADDED_DATA_FILE_NUM_METRIC = "addedDataFileNum"; - public static final String ADDED_DATA_FILE_SIZE_METRIC = "addedDataFileSize"; - public static final String REMOVED_DATA_FILE_NUM_METRIC = "removedDataFileNum"; - public static final String REMOVED_DATA_FILE_SIZE_METRIC = "removedDataFileSize"; - - static MetricGroup groupFor( - RuntimeContext context, String tableName, String taskName, int taskIndex) { - return groupFor(groupFor(context, tableName), taskName, taskIndex); - } - - static MetricGroup groupFor(RuntimeContext context, String tableName) { - return context - .getMetricGroup() - .addGroup(TableMaintenanceMetrics.GROUP_KEY) - .addGroup(TableMaintenanceMetrics.TABLE_NAME_KEY, tableName); - } - - static MetricGroup groupFor(MetricGroup mainGroup, String taskName, int taskIndex) { - return mainGroup - .addGroup(TableMaintenanceMetrics.TASK_NAME_KEY, taskName) - .addGroup(TableMaintenanceMetrics.TASK_INDEX_KEY, String.valueOf(taskIndex)); - } - - private TableMaintenanceMetrics() { - // do not instantiate - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java deleted file mode 100644 index 0b6b09b8902a..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableReader.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import org.apache.flink.api.common.functions.OpenContext; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.metrics.Counter; -import org.apache.flink.streaming.api.functions.ProcessFunction; -import org.apache.flink.table.data.RowData; -import org.apache.flink.util.Collector; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.MetadataTableUtils; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.reader.MetaDataReaderFunction; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Reads the records from the metadata table splits. 
*/ -abstract class TableReader extends ProcessFunction { - private static final Logger LOG = LoggerFactory.getLogger(TableReader.class); - - private final TableLoader tableLoader; - private final String taskName; - private final int taskIndex; - private final Schema projectedSchema; - private IcebergSourceSplitSerializer splitSerializer; - private final ScanContext scanContext; - private final MetadataTableType metadataTableType; - - private transient MetaDataReaderFunction rowDataReaderFunction; - private transient Counter errorCounter; - - TableReader( - String taskName, - int taskIndex, - TableLoader tableLoader, - Schema projectedSchema, - ScanContext scanContext, - MetadataTableType metadataTableType) { - Preconditions.checkNotNull(taskName, "Task name should no be null"); - Preconditions.checkNotNull(tableLoader, "Table should no be null"); - Preconditions.checkNotNull(projectedSchema, "The projected schema should no be null"); - - this.tableLoader = tableLoader; - this.taskName = taskName; - this.taskIndex = taskIndex; - this.projectedSchema = projectedSchema; - this.scanContext = scanContext; - this.metadataTableType = metadataTableType; - } - - @Override - public void open(OpenContext openContext) throws Exception { - tableLoader.open(); - Table table = tableLoader.loadTable(); - Table metaTable = MetadataTableUtils.createMetadataTableInstance(table, metadataTableType); - this.errorCounter = - TableMaintenanceMetrics.groupFor(getRuntimeContext(), table.name(), taskName, taskIndex) - .counter(TableMaintenanceMetrics.ERROR_COUNTER); - this.rowDataReaderFunction = - new MetaDataReaderFunction( - new Configuration(), - metaTable.schema(), - projectedSchema, - metaTable.io(), - metaTable.encryption()); - this.splitSerializer = new IcebergSourceSplitSerializer(scanContext.caseSensitive()); - } - - @Override - public void processElement( - MetadataTablePlanner.SplitInfo splitInfo, Context ctx, Collector out) throws Exception { - IcebergSourceSplit split = splitSerializer.deserialize(splitInfo.version(), splitInfo.split()); - try (DataIterator iterator = rowDataReaderFunction.createDataIterator(split)) { - iterator.forEachRemaining(rowData -> extract(rowData, out)); - } catch (Exception e) { - LOG.warn("Exception processing split {} at {}", split, ctx.timestamp(), e); - ctx.output(DeleteOrphanFiles.ERROR_STREAM, e); - errorCounter.inc(); - } - } - - @Override - public void close() throws Exception { - super.close(); - tableLoader.close(); - } - - /** - * Extracts the desired data from the given RowData. - * - * @param rowData the RowData from which to extract - * @param out the Collector to which to output the extracted data - */ - abstract void extract(RowData rowData, Collector out); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java deleted file mode 100644 index bd8f709e37ab..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TaskResultAggregator.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.util.List; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.TwoInputStreamOperator; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.util.OutputTag; -import org.apache.iceberg.flink.maintenance.api.TaskResult; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Aggregates results of the operators for a given maintenance task. - * - *

- * <ul>
- *   <li>Input 1 is used:
- *       <ul>
- *         <li>To provide the {@link TaskResult#startEpoch()} - should be chained to the task input
- *         <li>To mark that the task is finished - should be chained at the end of the task, so an
- *             incoming watermark will signal that the task is finished
- *       </ul>
- *   <li>Input 2 expects an {@link Exception} which caused the failure - should be chained to the
- *       {@link #ERROR_STREAM} of the operators
- * </ul>
    - * - * The operator emits a {@link TaskResult} with the overall result on {@link Watermark}. - */ -@Internal -public class TaskResultAggregator extends AbstractStreamOperator - implements TwoInputStreamOperator { - public static final OutputTag ERROR_STREAM = - new OutputTag<>("error-stream", TypeInformation.of(Exception.class)); - - private static final Logger LOG = LoggerFactory.getLogger(TaskResultAggregator.class); - - private final String tableName; - private final String taskName; - private final int taskIndex; - private final List exceptions; - private transient long startTime; - - public TaskResultAggregator(String tableName, String taskName, int taskIndex) { - Preconditions.checkNotNull(tableName, "Table name should no be null"); - Preconditions.checkNotNull(taskName, "Task name should no be null"); - - this.tableName = tableName; - this.taskName = taskName; - this.taskIndex = taskIndex; - this.exceptions = Lists.newArrayList(); - } - - @Override - public void processElement1(StreamRecord streamRecord) { - startTime = streamRecord.getValue().timestamp(); - } - - @Override - public void processElement2(StreamRecord streamRecord) { - Preconditions.checkNotNull(streamRecord.getValue(), "Exception could not be `null`."); - exceptions.add(streamRecord.getValue()); - } - - @Override - public void processWatermark(Watermark mark) throws Exception { - if (startTime != 0L) { - TaskResult response = new TaskResult(taskIndex, startTime, exceptions.isEmpty(), exceptions); - output.collect(new StreamRecord<>(response)); - LOG.info( - "Aggregated result for table {}, task {}[{}] is {}", - tableName, - taskName, - taskIndex, - response); - exceptions.clear(); - startTime = 0L; - } - - super.processWatermark(mark); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java deleted file mode 100644 index d448898bdfe6..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerEvaluator.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.io.Serializable; -import java.time.Duration; -import java.util.List; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@Internal -public class TriggerEvaluator implements Serializable { - private static final Logger LOG = LoggerFactory.getLogger(TriggerEvaluator.class); - private final List predicates; - - private TriggerEvaluator(List predicates) { - Preconditions.checkArgument(!predicates.isEmpty(), "Provide at least 1 condition."); - - this.predicates = predicates; - } - - boolean check(TableChange event, long lastTimeMs, long currentTimeMs) { - boolean result = - predicates.stream().anyMatch(p -> p.evaluate(event, lastTimeMs, currentTimeMs)); - LOG.debug( - "Checking event: {}, at {}, last: {} with result: {}", - event, - currentTimeMs, - lastTimeMs, - result); - return result; - } - - public static class Builder implements Serializable { - private Integer dataFileCount; - private Long dataFileSizeInBytes; - private Integer posDeleteFileCount; - private Long posDeleteRecordCount; - private Integer eqDeleteFileCount; - private Long eqDeleteRecordCount; - private Integer commitCount; - private Duration timeout; - - public Builder dataFileCount(int newDataFileCount) { - this.dataFileCount = newDataFileCount; - return this; - } - - public Builder dataFileSizeInBytes(long neDataFileSizeInBytes) { - this.dataFileSizeInBytes = neDataFileSizeInBytes; - return this; - } - - public Builder posDeleteFileCount(int newPosDeleteFileCount) { - this.posDeleteFileCount = newPosDeleteFileCount; - return this; - } - - public Builder posDeleteRecordCount(long newPosDeleteRecordCount) { - this.posDeleteRecordCount = newPosDeleteRecordCount; - return this; - } - - public Builder eqDeleteFileCount(int newEqDeleteFileCount) { - this.eqDeleteFileCount = newEqDeleteFileCount; - return this; - } - - public Builder eqDeleteRecordCount(long newEqDeleteRecordCount) { - this.eqDeleteRecordCount = newEqDeleteRecordCount; - return this; - } - - public Builder commitCount(int newCommitCount) { - this.commitCount = newCommitCount; - return this; - } - - public Builder timeout(Duration newTimeout) { - this.timeout = newTimeout; - return this; - } - - public TriggerEvaluator build() { - List predicates = Lists.newArrayList(); - if (dataFileCount != null) { - predicates.add((change, unused, unused2) -> change.dataFileCount() >= dataFileCount); - } - - if (dataFileSizeInBytes != null) { - predicates.add( - (change, unused, unused2) -> change.dataFileSizeInBytes() >= dataFileSizeInBytes); - } - - if (posDeleteFileCount != null) { - predicates.add( - (change, unused, unused2) -> change.posDeleteFileCount() >= posDeleteFileCount); - } - - if (posDeleteRecordCount != null) { - predicates.add( - (change, unused, unused2) -> change.posDeleteRecordCount() >= posDeleteRecordCount); - } - - if (eqDeleteFileCount != null) { - predicates.add( - (change, unused, unused2) -> change.eqDeleteFileCount() >= eqDeleteFileCount); - } - - if (eqDeleteRecordCount != null) { - predicates.add( - (change, unused, unused2) -> change.eqDeleteRecordCount() >= eqDeleteRecordCount); - } - - if (commitCount != null) { - predicates.add((change, unused, unused2) -> change.commitCount() >= commitCount); - } - - if (timeout != null) { - predicates.add( - (change, lastTimeMs, 
currentTimeMs) -> - currentTimeMs - lastTimeMs >= timeout.toMillis()); - } - - return new TriggerEvaluator(predicates); - } - } - - private interface Predicate extends Serializable { - boolean evaluate(TableChange event, long lastTimeMs, long currentTimeMs); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java deleted file mode 100644 index bd8424d726ec..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TriggerManager.java +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.state.ValueState; -import org.apache.flink.api.common.state.ValueStateDescriptor; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.common.typeinfo.Types; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.runtime.state.FunctionInitializationContext; -import org.apache.flink.runtime.state.FunctionSnapshotContext; -import org.apache.flink.streaming.api.TimerService; -import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; -import org.apache.flink.streaming.api.functions.KeyedProcessFunction; -import org.apache.flink.util.Collector; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * TriggerManager starts the Maintenance Tasks by emitting {@link Trigger} messages which are - * calculated based on the incoming {@link TableChange} messages. The TriggerManager keeps track of - * the changes since the last run of the Maintenance Tasks and triggers a new run based on the - * result of the {@link TriggerEvaluator}. - * - *

    The TriggerManager prevents overlapping Maintenance Task runs using {@link - * TriggerLockFactory.Lock}. The current implementation only handles conflicts within a single job. - * Users should avoid scheduling maintenance for the same table in different Flink jobs. - * - *

    The TriggerManager should run as a global operator. {@link KeyedProcessFunction} is used, so - * the timer functions are available, but the key is not used. - */ -@Internal -public class TriggerManager extends KeyedProcessFunction - implements CheckpointedFunction { - private static final Logger LOG = LoggerFactory.getLogger(TriggerManager.class); - - private final String tableName; - private final TriggerLockFactory lockFactory; - private final List maintenanceTaskNames; - private final List evaluators; - private final long minFireDelayMs; - private final long lockCheckDelayMs; - private transient Counter rateLimiterTriggeredCounter; - private transient Counter concurrentRunThrottledCounter; - private transient Counter nothingToTriggerCounter; - private transient List triggerCounters; - private transient ValueState nextEvaluationTimeState; - private transient ListState accumulatedChangesState; - private transient ListState lastTriggerTimesState; - private transient Long nextEvaluationTime; - private transient List accumulatedChanges; - private transient List lastTriggerTimes; - private transient TriggerLockFactory.Lock lock; - private transient TriggerLockFactory.Lock recoveryLock; - private transient boolean shouldRestoreTasks = false; - private transient boolean inited = false; - // To keep the task scheduling fair we keep the last triggered task position in memory. - // If we find a task to trigger, then we run it, but after it is finished, we start from the given - // position to prevent "starvation" of the tasks. - // When there is nothing to trigger, we start from the beginning, as the order of the tasks might - // be important (RewriteDataFiles first, and then RewriteManifestFiles later) - private transient int startsFrom = 0; - private transient boolean triggered = false; - - public TriggerManager( - TableLoader tableLoader, - TriggerLockFactory lockFactory, - List maintenanceTaskNames, - List evaluators, - long minFireDelayMs, - long lockCheckDelayMs) { - Preconditions.checkNotNull(tableLoader, "Table loader should no be null"); - Preconditions.checkNotNull(lockFactory, "Lock factory should no be null"); - Preconditions.checkArgument( - maintenanceTaskNames != null && !maintenanceTaskNames.isEmpty(), - "Invalid maintenance task names: null or empty"); - Preconditions.checkArgument( - evaluators != null && !evaluators.isEmpty(), "Invalid evaluators: null or empty"); - Preconditions.checkArgument( - maintenanceTaskNames.size() == evaluators.size(), - "Provide a name and evaluator for all of the maintenance tasks"); - Preconditions.checkArgument(minFireDelayMs > 0, "Minimum fire delay should be at least 1."); - Preconditions.checkArgument( - lockCheckDelayMs > 0, "Minimum lock delay rate should be at least 1 ms."); - - tableLoader.open(); - this.tableName = tableLoader.loadTable().name(); - this.lockFactory = lockFactory; - this.maintenanceTaskNames = maintenanceTaskNames; - this.evaluators = evaluators; - this.minFireDelayMs = minFireDelayMs; - this.lockCheckDelayMs = lockCheckDelayMs; - } - - @Override - public void open(Configuration parameters) throws Exception { - MetricGroup mainGroup = TableMaintenanceMetrics.groupFor(getRuntimeContext(), tableName); - this.rateLimiterTriggeredCounter = - mainGroup.counter(TableMaintenanceMetrics.RATE_LIMITER_TRIGGERED); - this.concurrentRunThrottledCounter = - mainGroup.counter(TableMaintenanceMetrics.CONCURRENT_RUN_THROTTLED); - this.nothingToTriggerCounter = mainGroup.counter(TableMaintenanceMetrics.NOTHING_TO_TRIGGER); - 
this.triggerCounters = Lists.newArrayListWithExpectedSize(maintenanceTaskNames.size()); - for (int taskIndex = 0; taskIndex < maintenanceTaskNames.size(); ++taskIndex) { - triggerCounters.add( - TableMaintenanceMetrics.groupFor( - mainGroup, maintenanceTaskNames.get(taskIndex), taskIndex) - .counter(TableMaintenanceMetrics.TRIGGERED)); - } - - this.nextEvaluationTimeState = - getRuntimeContext() - .getState(new ValueStateDescriptor<>("triggerManagerNextTriggerTime", Types.LONG)); - this.accumulatedChangesState = - getRuntimeContext() - .getListState( - new ListStateDescriptor<>( - "triggerManagerAccumulatedChange", TypeInformation.of(TableChange.class))); - this.lastTriggerTimesState = - getRuntimeContext() - .getListState(new ListStateDescriptor<>("triggerManagerLastTriggerTime", Types.LONG)); - } - - @Override - public void snapshotState(FunctionSnapshotContext context) throws Exception { - if (inited) { - // Only store state if initialized - nextEvaluationTimeState.update(nextEvaluationTime); - accumulatedChangesState.update(accumulatedChanges); - lastTriggerTimesState.update(lastTriggerTimes); - LOG.info( - "Storing state: nextEvaluationTime {}, accumulatedChanges {}, lastTriggerTimes {}", - nextEvaluationTime, - accumulatedChanges, - lastTriggerTimes); - } else { - LOG.info("Not initialized, state is not stored"); - } - } - - @Override - public void initializeState(FunctionInitializationContext context) throws Exception { - LOG.info("Initializing state restored: {}", context.isRestored()); - lockFactory.open(); - this.lock = lockFactory.createLock(); - this.recoveryLock = lockFactory.createRecoveryLock(); - if (context.isRestored()) { - shouldRestoreTasks = true; - } else { - lock.unlock(); - recoveryLock.unlock(); - } - } - - @Override - public void processElement(TableChange change, Context ctx, Collector out) - throws Exception { - init(out, ctx.timerService()); - - accumulatedChanges.forEach(tableChange -> tableChange.merge(change)); - - long current = ctx.timerService().currentProcessingTime(); - if (nextEvaluationTime == null) { - checkAndFire(current, ctx.timerService(), out); - } else { - LOG.info( - "Trigger manager rate limiter triggered current: {}, next: {}, accumulated changes: {}", - current, - nextEvaluationTime, - accumulatedChanges); - rateLimiterTriggeredCounter.inc(); - } - } - - @Override - public void onTimer(long timestamp, OnTimerContext ctx, Collector out) throws Exception { - init(out, ctx.timerService()); - this.nextEvaluationTime = null; - checkAndFire(ctx.timerService().currentProcessingTime(), ctx.timerService(), out); - } - - @Override - public void close() throws IOException { - lockFactory.close(); - } - - private void checkAndFire(long current, TimerService timerService, Collector out) { - if (shouldRestoreTasks) { - if (recoveryLock.isHeld()) { - // Recovered tasks in progress. 
Skip trigger check - LOG.debug("The recovery lock is still held at {}", current); - schedule(timerService, current + lockCheckDelayMs); - return; - } else { - LOG.info("The recovery is finished at {}", current); - shouldRestoreTasks = false; - } - } - - Integer taskToStart = - nextTrigger(evaluators, accumulatedChanges, lastTriggerTimes, current, startsFrom); - if (taskToStart == null) { - // Nothing to execute - if (!triggered) { - nothingToTriggerCounter.inc(); - LOG.debug("Nothing to execute at {} for collected: {}", current, accumulatedChanges); - } else { - LOG.debug("Execution check finished"); - } - - // Next time start from the beginning - startsFrom = 0; - triggered = false; - return; - } - - if (lock.tryLock()) { - TableChange change = accumulatedChanges.get(taskToStart); - out.collect(Trigger.create(current, taskToStart)); - LOG.debug("Fired event with time: {}, collected: {} for {}", current, change, tableName); - triggerCounters.get(taskToStart).inc(); - accumulatedChanges.set(taskToStart, TableChange.empty()); - lastTriggerTimes.set(taskToStart, current); - schedule(timerService, current + minFireDelayMs); - startsFrom = (taskToStart + 1) % evaluators.size(); - triggered = true; - } else { - // A task is already running, waiting for it to finish - LOG.info("Failed to acquire lock. Delaying task to {}", current + lockCheckDelayMs); - - startsFrom = taskToStart; - concurrentRunThrottledCounter.inc(); - schedule(timerService, current + lockCheckDelayMs); - } - - timerService.registerProcessingTimeTimer(nextEvaluationTime); - } - - private void schedule(TimerService timerService, long time) { - this.nextEvaluationTime = time; - timerService.registerProcessingTimeTimer(time); - } - - private static Integer nextTrigger( - List evaluators, - List changes, - List lastTriggerTimes, - long currentTime, - int startPos) { - int current = startPos; - do { - if (evaluators - .get(current) - .check(changes.get(current), lastTriggerTimes.get(current), currentTime)) { - return current; - } - - current = (current + 1) % evaluators.size(); - } while (current != startPos); - - return null; - } - - private void init(Collector out, TimerService timerService) throws Exception { - if (!inited) { - long current = timerService.currentProcessingTime(); - - // Initialize from state - this.nextEvaluationTime = nextEvaluationTimeState.value(); - this.accumulatedChanges = Lists.newArrayList(accumulatedChangesState.get()); - this.lastTriggerTimes = Lists.newArrayList(lastTriggerTimesState.get()); - - // Initialize if the state was empty - if (accumulatedChanges.isEmpty()) { - for (int i = 0; i < evaluators.size(); ++i) { - accumulatedChanges.add(TableChange.empty()); - lastTriggerTimes.add(current); - } - } - - if (shouldRestoreTasks) { - // When the job state is restored, there could be ongoing tasks. - // To prevent collision with the new triggers the following is done: - // - add a recovery lock - // - fire a recovery trigger - // This ensures that the tasks of the previous trigger are executed, and the lock is removed - // in the end. The result of the 'tryLock' is ignored as an already existing lock prevents - // collisions as well. 
- recoveryLock.tryLock(); - out.collect(Trigger.recovery(current)); - if (nextEvaluationTime == null) { - schedule(timerService, current + minFireDelayMs); - } - } - - inited = true; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java deleted file mode 100644 index f7e8e0c884cf..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.formats.avro.AvroToRowDataConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.iceberg.avro.AvroSchemaUtil; - -/** - * This util class converts Avro GenericRecord to Flink RowData.
    - *
    - * Internally it uses Flink {@link AvroToRowDataConverters}. Because of the precision difference - * between how Iceberg schema (micro) and Flink {@link AvroToRowDataConverters} (milli) deal with - * time type, we can't directly use the Avro Schema converted from Iceberg schema via {@link - * AvroSchemaUtil#convert(org.apache.iceberg.Schema, String)}. - */ -public class AvroGenericRecordToRowDataMapper implements MapFunction { - - private final AvroToRowDataConverters.AvroToRowDataConverter converter; - - AvroGenericRecordToRowDataMapper(RowType rowType) { - this.converter = AvroToRowDataConverters.createRowConverter(rowType); - } - - @Override - public RowData map(GenericRecord genericRecord) throws Exception { - return (RowData) converter.convert(genericRecord); - } - - /** Create a mapper based on Avro schema. */ - public static AvroGenericRecordToRowDataMapper forAvroSchema(Schema avroSchema) { - DataType dataType = AvroSchemaConverter.convertToDataType(avroSchema.toString()); - LogicalType logicalType = TypeConversions.fromDataToLogicalType(dataType); - RowType rowType = RowType.of(logicalType.getChildren().toArray(new LogicalType[0])); - return new AvroGenericRecordToRowDataMapper(rowType); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java deleted file mode 100644 index d845046cd2f6..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.Set; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.deletes.DeleteGranularity; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.RowDataProjection; -import org.apache.iceberg.io.BaseTaskWriter; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; - -abstract class BaseDeltaTaskWriter extends BaseTaskWriter { - - private final Schema schema; - private final Schema deleteSchema; - private final RowDataWrapper wrapper; - private final RowDataWrapper keyWrapper; - private final RowDataProjection keyProjection; - private final boolean upsert; - - BaseDeltaTaskWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - Set equalityFieldIds, - boolean upsert) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize); - this.schema = schema; - this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); - this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - this.keyWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); - this.keyProjection = - RowDataProjection.create(flinkSchema, schema.asStruct(), deleteSchema.asStruct()); - this.upsert = upsert; - } - - abstract RowDataDeltaWriter route(RowData row); - - RowDataWrapper wrapper() { - return wrapper; - } - - @Override - public void write(RowData row) throws IOException { - RowDataDeltaWriter writer = route(row); - - switch (row.getRowKind()) { - case INSERT: - case UPDATE_AFTER: - if (upsert) { - writer.deleteKey(keyProjection.wrap(row)); - } - writer.write(row); - break; - - case UPDATE_BEFORE: - if (upsert) { - break; // UPDATE_BEFORE is not necessary for UPSERT, we do nothing to prevent delete one - // row twice - } - writer.delete(row); - break; - case DELETE: - if (upsert) { - writer.deleteKey(keyProjection.wrap(row)); - } else { - writer.delete(row); - } - break; - - default: - throw new UnsupportedOperationException("Unknown row kind: " + row.getRowKind()); - } - } - - protected class RowDataDeltaWriter extends BaseEqualityDeltaWriter { - RowDataDeltaWriter(PartitionKey partition) { - super(partition, schema, deleteSchema, DeleteGranularity.FILE); - } - - @Override - protected StructLike asStructLike(RowData data) { - return wrapper.wrap(data); - } - - @Override - protected StructLike asStructLikeKey(RowData data) { - return keyWrapper.wrap(data); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java deleted file mode 100644 index 1cb6e013bd2c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the 
Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.stream.IntStream; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.RowDataWrapper; - -/** - * A {@link KeySelector} that extracts the bucketId from a data row's bucket partition as the key. - * To be used with the {@link BucketPartitioner}. - */ -class BucketPartitionKeySelector implements KeySelector { - - private final Schema schema; - private final PartitionKey partitionKey; - private final RowType flinkSchema; - private final int bucketFieldPosition; - - private transient RowDataWrapper rowDataWrapper; - - BucketPartitionKeySelector(PartitionSpec partitionSpec, Schema schema, RowType flinkSchema) { - this.schema = schema; - this.partitionKey = new PartitionKey(partitionSpec, schema); - this.flinkSchema = flinkSchema; - this.bucketFieldPosition = getBucketFieldPosition(partitionSpec); - } - - private int getBucketFieldPosition(PartitionSpec partitionSpec) { - int bucketFieldId = BucketPartitionerUtil.getBucketFieldId(partitionSpec); - return IntStream.range(0, partitionSpec.fields().size()) - .filter(i -> partitionSpec.fields().get(i).fieldId() == bucketFieldId) - .toArray()[0]; - } - - private RowDataWrapper lazyRowDataWrapper() { - if (rowDataWrapper == null) { - rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - - return rowDataWrapper; - } - - @Override - public Integer getKey(RowData rowData) { - partitionKey.partition(lazyRowDataWrapper().wrap(rowData)); - return partitionKey.get(bucketFieldPosition, Integer.class); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java deleted file mode 100644 index 9c9a117906e2..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * This partitioner will redirect records to writers deterministically based on the Bucket partition - * spec. It'll attempt to optimize the file size written depending on whether numPartitions is - * greater, less or equal than the maxNumBuckets. Note: The current implementation only supports ONE - * bucket in the partition spec. - */ -class BucketPartitioner implements Partitioner { - - static final String BUCKET_NULL_MESSAGE = "bucketId cannot be null"; - static final String BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE = - "Invalid bucket ID %s: must be non-negative."; - static final String BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE = - "Invalid bucket ID %s: must be less than bucket limit: %s."; - - private final int maxNumBuckets; - - // To hold the OFFSET of the next writer to use for any bucket, only used when writers > the - // number of buckets - private final int[] currentBucketWriterOffset; - - BucketPartitioner(PartitionSpec partitionSpec) { - this.maxNumBuckets = BucketPartitionerUtil.getMaxNumBuckets(partitionSpec); - this.currentBucketWriterOffset = new int[maxNumBuckets]; - } - - /** - * Determine the partition id based on the following criteria: If the number of writers <= the - * number of buckets, an evenly distributed number of buckets will be assigned to each writer (one - * writer -> many buckets). Conversely, if the number of writers > the number of buckets the logic - * is handled by the {@link #getPartitionWithMoreWritersThanBuckets - * getPartitionWritersGreaterThanBuckets} method. - * - * @param bucketId the bucketId for each request - * @param numPartitions the total number of partitions - * @return the partition id (writer) to use for each request - */ - @Override - public int partition(Integer bucketId, int numPartitions) { - Preconditions.checkNotNull(bucketId, BUCKET_NULL_MESSAGE); - Preconditions.checkArgument(bucketId >= 0, BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE, bucketId); - Preconditions.checkArgument( - bucketId < maxNumBuckets, BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE, bucketId, maxNumBuckets); - - if (numPartitions <= maxNumBuckets) { - return bucketId % numPartitions; - } else { - return getPartitionWithMoreWritersThanBuckets(bucketId, numPartitions); - } - } - - /*- - * If the number of writers > the number of buckets each partitioner will keep a state of multiple - * writers per bucket as evenly as possible, and will round-robin the requests across them, in this - * case each writer will target only one bucket at all times (many writers -> one bucket). 
Example: - * Configuration: numPartitions (writers) = 5, maxBuckets = 2 - * Expected behavior: - * - Records for Bucket 0 will be "round robin" between Writers 0, 2 and 4 - * - Records for Bucket 1 will always use Writer 1 and 3 - * Notes: - * - maxNumWritersPerBucket determines when to reset the currentBucketWriterOffset to 0 for this bucketId - * - When numPartitions is not evenly divisible by maxBuckets, some buckets will have one more writer (extraWriter). - * In this example Bucket 0 has an "extra writer" to consider before resetting its offset to 0. - * - * @return the destination partition index (writer subtask id) - */ - private int getPartitionWithMoreWritersThanBuckets(int bucketId, int numPartitions) { - int currentOffset = currentBucketWriterOffset[bucketId]; - // Determine if this bucket requires an "extra writer" - int extraWriter = bucketId < (numPartitions % maxNumBuckets) ? 1 : 0; - // The max number of writers this bucket can have - int maxNumWritersPerBucket = (numPartitions / maxNumBuckets) + extraWriter; - - // Increment the writer offset or reset if it's reached the max for this bucket - int nextOffset = currentOffset == maxNumWritersPerBucket - 1 ? 0 : currentOffset + 1; - currentBucketWriterOffset[bucketId] = nextOffset; - - return bucketId + (maxNumBuckets * currentOffset); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java deleted file mode 100644 index c33207728d3e..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import java.util.Objects; -import java.util.stream.Collectors; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.transforms.PartitionSpecVisitor; - -final class BucketPartitionerUtil { - static final String BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE = - "Invalid number of buckets: %s (must be 1)"; - - private BucketPartitionerUtil() {} - - /** - * Determines whether the PartitionSpec has one and only one Bucket definition - * - * @param partitionSpec the partition spec in question - * @return whether the PartitionSpec has only one Bucket - */ - static boolean hasOneBucketField(PartitionSpec partitionSpec) { - List> bucketFields = getBucketFields(partitionSpec); - return bucketFields != null && bucketFields.size() == 1; - } - - /** - * Extracts the Bucket definition from a PartitionSpec. 
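To make the offset arithmetic concrete, here is a small standalone sketch (the class name BucketAssignmentSketch is made up) that replays the documented example of 5 writers and 2 buckets using the same formulas as getPartitionWithMoreWritersThanBuckets:

    public class BucketAssignmentSketch {
      public static void main(String[] args) {
        int numPartitions = 5; // writer subtasks
        int maxNumBuckets = 2; // buckets in the partition spec
        int[] currentBucketWriterOffset = new int[maxNumBuckets];

        // Replay six records alternating between the two buckets.
        for (int call = 0; call < 6; call++) {
          int bucketId = call % maxNumBuckets;
          int currentOffset = currentBucketWriterOffset[bucketId];
          // Buckets below numPartitions % maxNumBuckets get one extra writer.
          int extraWriter = bucketId < (numPartitions % maxNumBuckets) ? 1 : 0;
          int maxNumWritersPerBucket = (numPartitions / maxNumBuckets) + extraWriter;
          currentBucketWriterOffset[bucketId] =
              currentOffset == maxNumWritersPerBucket - 1 ? 0 : currentOffset + 1;
          int writer = bucketId + (maxNumBuckets * currentOffset);
          // Prints: bucket 0 -> writers 0, 2, 4 (round robin); bucket 1 -> writers 1, 3.
          System.out.printf("bucket %d -> writer %d%n", bucketId, writer);
        }
      }
    }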
- * - * @param partitionSpec the partition spec in question - * @return the Bucket definition in the form of a tuple (fieldId, maxNumBuckets) - */ - private static Tuple2 getBucketFieldInfo(PartitionSpec partitionSpec) { - List> bucketFields = getBucketFields(partitionSpec); - Preconditions.checkArgument( - bucketFields.size() == 1, - BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, - bucketFields.size()); - return bucketFields.get(0); - } - - static int getBucketFieldId(PartitionSpec partitionSpec) { - return getBucketFieldInfo(partitionSpec).f0; - } - - static int getMaxNumBuckets(PartitionSpec partitionSpec) { - return getBucketFieldInfo(partitionSpec).f1; - } - - private static List> getBucketFields(PartitionSpec spec) { - return PartitionSpecVisitor.visit(spec, new BucketPartitionSpecVisitor()).stream() - .filter(Objects::nonNull) - .collect(Collectors.toList()); - } - - private static class BucketPartitionSpecVisitor - implements PartitionSpecVisitor> { - @Override - public Tuple2 identity(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 bucket( - int fieldId, String sourceName, int sourceId, int numBuckets) { - return new Tuple2<>(fieldId, numBuckets); - } - - @Override - public Tuple2 truncate( - int fieldId, String sourceName, int sourceId, int width) { - return null; - } - - @Override - public Tuple2 year(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 month(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 day(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 hour(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 alwaysNull(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 unknown( - int fieldId, String sourceName, int sourceId, String transform) { - return null; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java deleted file mode 100644 index 0afc07cc1977..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.time.Duration; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.SerializableSupplier; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A table loader that will only reload a table after a certain interval has passed. WARNING: This - * table loader should be used carefully when used with writer tasks. It could result in heavy load - * on a catalog for jobs with many writers. - */ -class CachingTableSupplier implements SerializableSupplier
<Table>
  • { - - private static final Logger LOG = LoggerFactory.getLogger(CachingTableSupplier.class); - - private final Table initialTable; - private final TableLoader tableLoader; - private final Duration tableRefreshInterval; - private long lastLoadTimeMillis; - private transient Table table; - - CachingTableSupplier( - SerializableTable initialTable, TableLoader tableLoader, Duration tableRefreshInterval) { - Preconditions.checkArgument(initialTable != null, "initialTable cannot be null"); - Preconditions.checkArgument(tableLoader != null, "tableLoader cannot be null"); - Preconditions.checkArgument( - tableRefreshInterval != null, "tableRefreshInterval cannot be null"); - this.initialTable = initialTable; - this.table = initialTable; - this.tableLoader = tableLoader; - this.tableRefreshInterval = tableRefreshInterval; - this.lastLoadTimeMillis = System.currentTimeMillis(); - } - - @Override - public Table get() { - if (table == null) { - this.table = initialTable; - } - return table; - } - - Table initialTable() { - return initialTable; - } - - void refreshTable() { - if (System.currentTimeMillis() > lastLoadTimeMillis + tableRefreshInterval.toMillis()) { - try { - if (!tableLoader.isOpen()) { - tableLoader.open(); - } - - this.table = tableLoader.loadTable(); - this.lastLoadTimeMillis = System.currentTimeMillis(); - - LOG.info( - "Table {} reloaded, next min load time threshold is {}", - table.name(), - DateTimeUtil.formatTimestampMillis( - lastLoadTimeMillis + tableRefreshInterval.toMillis())); - } catch (Exception e) { - LOG.warn("An error occurred reloading table {}, table was not reloaded", table.name(), e); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java deleted file mode 100644 index 1b786e46452f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
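Roughly how the supplier above is put together inside the sink (it is package-private, so the sketch pretends to live in org.apache.iceberg.flink.sink; the class name, method name, and the 5-minute interval are illustrative):

    package org.apache.iceberg.flink.sink;

    import java.time.Duration;
    import org.apache.iceberg.SerializableTable;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.flink.TableLoader;
    import org.apache.iceberg.util.SerializableSupplier;

    class CachingTableSupplierSketch {

      /** Builds a supplier that re-reads table metadata at most once per interval. */
      static SerializableSupplier<Table> refreshingSupplier(TableLoader loader) {
        loader.open();
        Table table = loader.loadTable();
        SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table);
        // Writers call get() for the cached copy and refreshTable() periodically
        // (e.g. on checkpoints) to pick up schema or spec changes.
        return new CachingTableSupplier(serializableTable, loader, Duration.ofMinutes(5));
      }
    }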
- */ -package org.apache.iceberg.flink.sink; - -import java.util.Arrays; -import java.util.List; -import java.util.NavigableMap; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.util.ScanTaskUtil; - -@Internal -public class CommitSummary { - - private final AtomicLong dataFilesCount = new AtomicLong(); - private final AtomicLong dataFilesRecordCount = new AtomicLong(); - private final AtomicLong dataFilesByteCount = new AtomicLong(); - private final AtomicLong deleteFilesCount = new AtomicLong(); - private final AtomicLong deleteFilesRecordCount = new AtomicLong(); - private final AtomicLong deleteFilesByteCount = new AtomicLong(); - - public CommitSummary() {} - - public CommitSummary(NavigableMap pendingResults) { - pendingResults.values().forEach(this::addWriteResult); - } - - public void addAll(NavigableMap> pendingResults) { - pendingResults.values().forEach(writeResults -> writeResults.forEach(this::addWriteResult)); - } - - private void addWriteResult(WriteResult writeResult) { - dataFilesCount.addAndGet(writeResult.dataFiles().length); - Arrays.stream(writeResult.dataFiles()) - .forEach( - dataFile -> { - dataFilesRecordCount.addAndGet(dataFile.recordCount()); - dataFilesByteCount.addAndGet(dataFile.fileSizeInBytes()); - }); - deleteFilesCount.addAndGet(writeResult.deleteFiles().length); - Arrays.stream(writeResult.deleteFiles()) - .forEach( - deleteFile -> { - deleteFilesRecordCount.addAndGet(deleteFile.recordCount()); - long deleteBytes = ScanTaskUtil.contentSizeInBytes(deleteFile); - deleteFilesByteCount.addAndGet(deleteBytes); - }); - } - - public long dataFilesCount() { - return dataFilesCount.get(); - } - - long dataFilesRecordCount() { - return dataFilesRecordCount.get(); - } - - long dataFilesByteCount() { - return dataFilesByteCount.get(); - } - - public long deleteFilesCount() { - return deleteFilesCount.get(); - } - - long deleteFilesRecordCount() { - return deleteFilesRecordCount.get(); - } - - long deleteFilesByteCount() { - return deleteFilesByteCount.get(); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("dataFilesCount", dataFilesCount) - .add("dataFilesRecordCount", dataFilesRecordCount) - .add("dataFilesByteCount", dataFilesByteCount) - .add("deleteFilesCount", deleteFilesCount) - .add("deleteFilesRecordCount", deleteFilesRecordCount) - .add("deleteFilesByteCount", deleteFilesByteCount) - .toString(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java deleted file mode 100644 index 1369d98e432b..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommittableToTableChangeConverter.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.Arrays; -import java.util.Map; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.functions.OpenContext; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; -import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; -import org.apache.flink.streaming.api.functions.ProcessFunction; -import org.apache.flink.util.Collector; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.flink.maintenance.operator.TableChange; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@Internal -public class CommittableToTableChangeConverter - extends ProcessFunction, TableChange> { - - private static final Logger LOG = - LoggerFactory.getLogger(CommittableToTableChangeConverter.class); - - private final FileIO io; - private final String tableName; - private final Map specs; - private transient String flinkJobId; - - public CommittableToTableChangeConverter( - FileIO fileIO, String tableName, Map specs) { - Preconditions.checkNotNull(fileIO, "FileIO should not be null"); - Preconditions.checkNotNull(tableName, "TableName should not be null"); - Preconditions.checkNotNull(specs, "Specs should not be null"); - this.io = fileIO; - this.tableName = tableName; - this.specs = specs; - } - - @Override - public void open(OpenContext openContext) throws Exception { - super.open(openContext); - Preconditions.checkState( - getRuntimeContext().getTaskInfo().getNumberOfParallelSubtasks() == 1, - "CommittableToTableChangeConverter must run with parallelism 1, current parallelism: %s", - getRuntimeContext().getTaskInfo().getNumberOfParallelSubtasks()); - - this.flinkJobId = getRuntimeContext().getJobInfo().getJobId().toString(); - } - - @Override - public void processElement( - CommittableMessage value, Context ctx, Collector out) - throws Exception { - if (value instanceof CommittableWithLineage) { - IcebergCommittable committable = - ((CommittableWithLineage) value).getCommittable(); - - if (committable == null || committable.manifest().length == 0) { - return; - } - - DeltaManifests deltaManifests; - WriteResult writeResult; - try { - deltaManifests = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, committable.manifest()); - writeResult = FlinkManifestUtil.readCompletedFiles(deltaManifests, io, specs); - } catch (Exception e) { - LOG.warn( - "Unable to read delta manifests for table {} at checkpoint {}", - tableName, - committable.checkpointId(), - e); - return; - } - - TableChange tableChange = - new TableChange( - Arrays.asList(writeResult.dataFiles()), Arrays.asList(writeResult.deleteFiles())); - out.collect(tableChange); - FlinkManifestUtil.deleteCommittedManifests( - tableName, io, deltaManifests.manifests(), flinkJobId, committable.checkpointId()); - } - } -} diff --git 
a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java deleted file mode 100644 index 92c50165c0f5..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -@Internal -public class DeltaManifests { - - private static final CharSequence[] EMPTY_REF_DATA_FILES = new CharSequence[0]; - - private final ManifestFile dataManifest; - private final ManifestFile deleteManifest; - private final CharSequence[] referencedDataFiles; - - DeltaManifests(ManifestFile dataManifest, ManifestFile deleteManifest) { - this(dataManifest, deleteManifest, EMPTY_REF_DATA_FILES); - } - - DeltaManifests( - ManifestFile dataManifest, ManifestFile deleteManifest, CharSequence[] referencedDataFiles) { - Preconditions.checkNotNull(referencedDataFiles, "Referenced data files shouldn't be null."); - - this.dataManifest = dataManifest; - this.deleteManifest = deleteManifest; - this.referencedDataFiles = referencedDataFiles; - } - - ManifestFile dataManifest() { - return dataManifest; - } - - ManifestFile deleteManifest() { - return deleteManifest; - } - - CharSequence[] referencedDataFiles() { - return referencedDataFiles; - } - - public List manifests() { - List manifests = Lists.newArrayListWithCapacity(2); - if (dataManifest != null) { - manifests.add(dataManifest); - } - - if (deleteManifest != null) { - manifests.add(deleteManifest); - } - - return manifests; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java deleted file mode 100644 index 6ad41bacf337..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import org.apache.flink.annotation.Internal; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -@Internal -public class DeltaManifestsSerializer implements SimpleVersionedSerializer { - private static final int VERSION_1 = 1; - private static final int VERSION_2 = 2; - private static final byte[] EMPTY_BINARY = new byte[0]; - - public static final DeltaManifestsSerializer INSTANCE = new DeltaManifestsSerializer(); - - @Override - public int getVersion() { - return VERSION_2; - } - - @Override - public byte[] serialize(DeltaManifests deltaManifests) throws IOException { - Preconditions.checkNotNull( - deltaManifests, "DeltaManifests to be serialized should not be null"); - - ByteArrayOutputStream binaryOut = new ByteArrayOutputStream(); - DataOutputStream out = new DataOutputStream(binaryOut); - - byte[] dataManifestBinary = EMPTY_BINARY; - if (deltaManifests.dataManifest() != null) { - dataManifestBinary = ManifestFiles.encode(deltaManifests.dataManifest()); - } - - out.writeInt(dataManifestBinary.length); - out.write(dataManifestBinary); - - byte[] deleteManifestBinary = EMPTY_BINARY; - if (deltaManifests.deleteManifest() != null) { - deleteManifestBinary = ManifestFiles.encode(deltaManifests.deleteManifest()); - } - - out.writeInt(deleteManifestBinary.length); - out.write(deleteManifestBinary); - - CharSequence[] referencedDataFiles = deltaManifests.referencedDataFiles(); - out.writeInt(referencedDataFiles.length); - for (CharSequence referencedDataFile : referencedDataFiles) { - out.writeUTF(referencedDataFile.toString()); - } - - return binaryOut.toByteArray(); - } - - @Override - public DeltaManifests deserialize(int version, byte[] serialized) throws IOException { - if (version == VERSION_1) { - return deserializeV1(serialized); - } else if (version == VERSION_2) { - return deserializeV2(serialized); - } else { - throw new RuntimeException("Unknown serialize version: " + version); - } - } - - private DeltaManifests deserializeV1(byte[] serialized) throws IOException { - return new DeltaManifests(ManifestFiles.decode(serialized), null); - } - - private DeltaManifests deserializeV2(byte[] serialized) throws IOException { - ManifestFile dataManifest = null; - ManifestFile deleteManifest = null; - - ByteArrayInputStream binaryIn = new ByteArrayInputStream(serialized); - DataInputStream in = new DataInputStream(binaryIn); - - int dataManifestSize = in.readInt(); - if (dataManifestSize > 0) { - byte[] dataManifestBinary = new byte[dataManifestSize]; - Preconditions.checkState(in.read(dataManifestBinary) == dataManifestSize); - - dataManifest = ManifestFiles.decode(dataManifestBinary); - } - - int deleteManifestSize = in.readInt(); - if (deleteManifestSize > 0) { - byte[] 
deleteManifestBinary = new byte[deleteManifestSize]; - Preconditions.checkState(in.read(deleteManifestBinary) == deleteManifestSize); - - deleteManifest = ManifestFiles.decode(deleteManifestBinary); - } - - int referenceDataFileNum = in.readInt(); - CharSequence[] referencedDataFiles = new CharSequence[referenceDataFileNum]; - for (int i = 0; i < referenceDataFileNum; i++) { - referencedDataFiles[i] = in.readUTF(); - } - - return new DeltaManifests(dataManifest, deleteManifest, referencedDataFiles); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java deleted file mode 100644 index 92e47792c13b..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.Set; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.StructLikeWrapper; -import org.apache.iceberg.util.StructProjection; - -/** - * Create a {@link KeySelector} to shuffle by equality fields, to ensure same equality fields record - * will be emitted to same writer in order. - */ -@Internal -public class EqualityFieldKeySelector implements KeySelector { - - private final Schema schema; - private final RowType flinkSchema; - private final Schema deleteSchema; - - private transient RowDataWrapper rowDataWrapper; - private transient StructProjection structProjection; - private transient StructLikeWrapper structLikeWrapper; - - public EqualityFieldKeySelector( - Schema schema, RowType flinkSchema, Set equalityFieldIds) { - this.schema = schema; - this.flinkSchema = flinkSchema; - this.deleteSchema = TypeUtil.select(schema, equalityFieldIds); - } - - /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not - * serializable. In this way, we don't have to serialize them with forcing. - */ - protected RowDataWrapper lazyRowDataWrapper() { - if (rowDataWrapper == null) { - rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - return rowDataWrapper; - } - - /** Construct the {@link StructProjection} lazily because it is not serializable. 
*/ - protected StructProjection lazyStructProjection() { - if (structProjection == null) { - structProjection = StructProjection.create(schema, deleteSchema); - } - return structProjection; - } - - /** Construct the {@link StructLikeWrapper} lazily because it is not serializable. */ - protected StructLikeWrapper lazyStructLikeWrapper() { - if (structLikeWrapper == null) { - structLikeWrapper = StructLikeWrapper.forType(deleteSchema.asStruct()); - } - return structLikeWrapper; - } - - @Override - public Integer getKey(RowData row) { - RowDataWrapper wrappedRowData = lazyRowDataWrapper().wrap(row); - StructProjection projectedRowData = lazyStructProjection().wrap(wrappedRowData); - StructLikeWrapper wrapper = lazyStructLikeWrapper().set(projectedRowData); - return wrapper.hashCode(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java deleted file mode 100644 index b6f1392d1562..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
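A short sketch of applying the key selector above, assuming the table's identifier fields act as the equality fields (class and method names are illustrative):

    import java.util.Set;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.datastream.KeyedStream;
    import org.apache.flink.table.data.RowData;
    import org.apache.flink.table.types.logical.RowType;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.flink.FlinkSchemaUtil;
    import org.apache.iceberg.flink.sink.EqualityFieldKeySelector;

    public class EqualityShuffleSketch {

      /** Routes all rows that share an equality key to the same downstream subtask. */
      public static KeyedStream<RowData, Integer> keyByEqualityFields(
          DataStream<RowData> rows, Table table) {
        Schema schema = table.schema();
        RowType flinkType = FlinkSchemaUtil.convert(schema);
        Set<Integer> equalityFieldIds = schema.identifierFieldIds();
        return rows.keyBy(new EqualityFieldKeySelector(schema, flinkType, equalityFieldIds));
      }
    }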
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.io.Serializable; -import java.io.UncheckedIOException; -import java.util.Map; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.MetricsConfig; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.deletes.EqualityDeleteWriter; -import org.apache.iceberg.deletes.PositionDeleteWriter; -import org.apache.iceberg.encryption.EncryptedOutputFile; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.data.FlinkAvroWriter; -import org.apache.iceberg.flink.data.FlinkOrcWriter; -import org.apache.iceberg.flink.data.FlinkParquetWriters; -import org.apache.iceberg.io.DataWriter; -import org.apache.iceberg.io.DeleteSchemaUtil; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class FlinkAppenderFactory implements FileAppenderFactory, Serializable { - private final Schema schema; - private final RowType flinkSchema; - private final Map props; - private final PartitionSpec spec; - private final int[] equalityFieldIds; - private final Schema eqDeleteRowSchema; - private final Schema posDeleteRowSchema; - private final Table table; - - private RowType eqDeleteFlinkSchema = null; - private RowType posDeleteFlinkSchema = null; - - public FlinkAppenderFactory( - Table table, - Schema schema, - RowType flinkSchema, - Map props, - PartitionSpec spec, - int[] equalityFieldIds, - Schema eqDeleteRowSchema, - Schema posDeleteRowSchema) { - Preconditions.checkNotNull(table, "Table shouldn't be null"); - this.table = table; - this.schema = schema; - this.flinkSchema = flinkSchema; - this.props = props; - this.spec = spec; - this.equalityFieldIds = equalityFieldIds; - this.eqDeleteRowSchema = eqDeleteRowSchema; - this.posDeleteRowSchema = posDeleteRowSchema; - } - - private RowType lazyEqDeleteFlinkSchema() { - if (eqDeleteFlinkSchema == null) { - Preconditions.checkNotNull(eqDeleteRowSchema, "Equality delete row schema shouldn't be null"); - this.eqDeleteFlinkSchema = FlinkSchemaUtil.convert(eqDeleteRowSchema); - } - return eqDeleteFlinkSchema; - } - - private RowType lazyPosDeleteFlinkSchema() { - if (posDeleteFlinkSchema == null) { - Preconditions.checkNotNull(posDeleteRowSchema, "Pos-delete row schema shouldn't be null"); - this.posDeleteFlinkSchema = FlinkSchemaUtil.convert(posDeleteRowSchema); - } - return this.posDeleteFlinkSchema; - } - - @Override - public FileAppender newAppender(OutputFile outputFile, FileFormat format) { - MetricsConfig metricsConfig = MetricsConfig.forTable(table); - try { - switch (format) { - case AVRO: - return Avro.write(outputFile) - .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) - .setAll(props) - .schema(schema) - .metricsConfig(metricsConfig) - .overwrite() - .build(); - - case ORC: - return ORC.write(outputFile) - .createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) - .setAll(props) - .metricsConfig(metricsConfig) - .schema(schema) - 
.overwrite() - .build(); - - case PARQUET: - return Parquet.write(outputFile) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(flinkSchema, msgType)) - .setAll(props) - .metricsConfig(metricsConfig) - .schema(schema) - .overwrite() - .build(); - - default: - throw new UnsupportedOperationException("Cannot write unknown file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - @Override - public DataWriter newDataWriter( - EncryptedOutputFile file, FileFormat format, StructLike partition) { - return new DataWriter<>( - newAppender(file.encryptingOutputFile(), format), - format, - file.encryptingOutputFile().location(), - spec, - partition, - file.keyMetadata()); - } - - @Override - public EqualityDeleteWriter newEqDeleteWriter( - EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { - Preconditions.checkState( - equalityFieldIds != null && equalityFieldIds.length > 0, - "Equality field ids shouldn't be null or empty when creating equality-delete writer"); - Preconditions.checkNotNull( - eqDeleteRowSchema, - "Equality delete row schema shouldn't be null when creating equality-delete writer"); - - MetricsConfig metricsConfig = MetricsConfig.forTable(table); - try { - switch (format) { - case AVRO: - return Avro.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(ignore -> new FlinkAvroWriter(lazyEqDeleteFlinkSchema())) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) - .buildEqualityWriter(); - - case ORC: - return ORC.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) - .buildEqualityWriter(); - - case PARQUET: - return Parquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) - .buildEqualityWriter(); - - default: - throw new UnsupportedOperationException( - "Cannot write equality-deletes for unsupported file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - @Override - public PositionDeleteWriter newPosDeleteWriter( - EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { - MetricsConfig metricsConfig = MetricsConfig.forPositionDelete(table); - try { - switch (format) { - case AVRO: - return Avro.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(ignore -> new FlinkAvroWriter(lazyPosDeleteFlinkSchema())) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .buildPositionWriter(); - - case ORC: - RowType orcPosDeleteSchema = - FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); - return ORC.writeDeletes(outputFile.encryptingOutputFile()) - 
.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, iSchema)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .transformPaths(path -> StringData.fromString(path.toString())) - .buildPositionWriter(); - - case PARQUET: - RowType flinkPosDeleteSchema = - FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); - return Parquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .transformPaths(path -> StringData.fromString(path.toString())) - .buildPositionWriter(); - - default: - throw new UnsupportedOperationException( - "Cannot write pos-deletes for unsupported file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java deleted file mode 100644 index 2183fe062af4..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - -import java.io.Serializable; -import java.util.Map; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.BaseFileWriterFactory; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.data.FlinkAvroWriter; -import org.apache.iceberg.flink.data.FlinkOrcWriter; -import org.apache.iceberg.flink.data.FlinkParquetWriters; -import org.apache.iceberg.io.DeleteSchemaUtil; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class FlinkFileWriterFactory extends BaseFileWriterFactory implements Serializable { - private RowType dataFlinkType; - private RowType equalityDeleteFlinkType; - private RowType positionDeleteFlinkType; - - FlinkFileWriterFactory( - Table table, - FileFormat dataFileFormat, - Schema dataSchema, - RowType dataFlinkType, - SortOrder dataSortOrder, - FileFormat deleteFileFormat, - int[] equalityFieldIds, - Schema equalityDeleteRowSchema, - RowType equalityDeleteFlinkType, - SortOrder equalityDeleteSortOrder, - Schema positionDeleteRowSchema, - RowType positionDeleteFlinkType) { - - super( - table, - dataFileFormat, - dataSchema, - dataSortOrder, - deleteFileFormat, - equalityFieldIds, - equalityDeleteRowSchema, - equalityDeleteSortOrder, - positionDeleteRowSchema); - - this.dataFlinkType = dataFlinkType; - this.equalityDeleteFlinkType = equalityDeleteFlinkType; - this.positionDeleteFlinkType = positionDeleteFlinkType; - } - - static Builder builderFor(Table table) { - return new Builder(table); - } - - @Override - protected void configureDataWrite(Avro.DataWriteBuilder builder) { - builder.createWriterFunc(ignore -> new FlinkAvroWriter(dataFlinkType())); - } - - @Override - protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { - builder.createWriterFunc(ignored -> new FlinkAvroWriter(equalityDeleteFlinkType())); - } - - @Override - protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { - int rowFieldIndex = positionDeleteFlinkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME); - if (rowFieldIndex >= 0) { - // FlinkAvroWriter accepts just the Flink type of the row ignoring the path and pos - RowType positionDeleteRowFlinkType = - (RowType) positionDeleteFlinkType().getTypeAt(rowFieldIndex); - builder.createWriterFunc(ignored -> new FlinkAvroWriter(positionDeleteRowFlinkType)); - } - } - - @Override - protected void configureDataWrite(Parquet.DataWriteBuilder builder) { - builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(dataFlinkType(), msgType)); - } - - @Override - protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); - } - - @Override - protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) 
{ - builder.createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(positionDeleteFlinkType(), msgType)); - builder.transformPaths(path -> StringData.fromString(path.toString())); - } - - @Override - protected void configureDataWrite(ORC.DataWriteBuilder builder) { - builder.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); - } - - @Override - protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); - } - - @Override - protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema)); - builder.transformPaths(path -> StringData.fromString(path.toString())); - } - - private RowType dataFlinkType() { - if (dataFlinkType == null) { - Preconditions.checkNotNull(dataSchema(), "Data schema must not be null"); - this.dataFlinkType = FlinkSchemaUtil.convert(dataSchema()); - } - - return dataFlinkType; - } - - private RowType equalityDeleteFlinkType() { - if (equalityDeleteFlinkType == null) { - Preconditions.checkNotNull( - equalityDeleteRowSchema(), "Equality delete schema must not be null"); - this.equalityDeleteFlinkType = FlinkSchemaUtil.convert(equalityDeleteRowSchema()); - } - - return equalityDeleteFlinkType; - } - - private RowType positionDeleteFlinkType() { - if (positionDeleteFlinkType == null) { - // wrap the optional row schema into the position delete schema that contains path and - // position - Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); - this.positionDeleteFlinkType = FlinkSchemaUtil.convert(positionDeleteSchema); - } - - return positionDeleteFlinkType; - } - - static class Builder { - private final Table table; - private FileFormat dataFileFormat; - private Schema dataSchema; - private RowType dataFlinkType; - private SortOrder dataSortOrder; - private FileFormat deleteFileFormat; - private int[] equalityFieldIds; - private Schema equalityDeleteRowSchema; - private RowType equalityDeleteFlinkType; - private SortOrder equalityDeleteSortOrder; - private Schema positionDeleteRowSchema; - private RowType positionDeleteFlinkType; - - Builder(Table table) { - this.table = table; - - Map properties = table.properties(); - - String dataFileFormatName = - properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); - this.dataFileFormat = FileFormat.fromString(dataFileFormatName); - - String deleteFileFormatName = - properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); - this.deleteFileFormat = FileFormat.fromString(deleteFileFormatName); - } - - Builder dataFileFormat(FileFormat newDataFileFormat) { - this.dataFileFormat = newDataFileFormat; - return this; - } - - Builder dataSchema(Schema newDataSchema) { - this.dataSchema = newDataSchema; - return this; - } - - /** - * Sets a Flink type for data. - * - *
    If not set, the value is derived from the provided Iceberg schema. - */ - Builder dataFlinkType(RowType newDataFlinkType) { - this.dataFlinkType = newDataFlinkType; - return this; - } - - Builder dataSortOrder(SortOrder newDataSortOrder) { - this.dataSortOrder = newDataSortOrder; - return this; - } - - Builder deleteFileFormat(FileFormat newDeleteFileFormat) { - this.deleteFileFormat = newDeleteFileFormat; - return this; - } - - Builder equalityFieldIds(int[] newEqualityFieldIds) { - this.equalityFieldIds = newEqualityFieldIds; - return this; - } - - Builder equalityDeleteRowSchema(Schema newEqualityDeleteRowSchema) { - this.equalityDeleteRowSchema = newEqualityDeleteRowSchema; - return this; - } - - /** - * Sets a Flink type for equality deletes. - * - *
    If not set, the value is derived from the provided Iceberg schema. - */ - Builder equalityDeleteFlinkType(RowType newEqualityDeleteFlinkType) { - this.equalityDeleteFlinkType = newEqualityDeleteFlinkType; - return this; - } - - Builder equalityDeleteSortOrder(SortOrder newEqualityDeleteSortOrder) { - this.equalityDeleteSortOrder = newEqualityDeleteSortOrder; - return this; - } - - Builder positionDeleteRowSchema(Schema newPositionDeleteRowSchema) { - this.positionDeleteRowSchema = newPositionDeleteRowSchema; - return this; - } - - /** - * Sets a Flink type for position deletes. - * - *
    If not set, the value is derived from the provided Iceberg schema. - */ - Builder positionDeleteFlinkType(RowType newPositionDeleteFlinkType) { - this.positionDeleteFlinkType = newPositionDeleteFlinkType; - return this; - } - - FlinkFileWriterFactory build() { - boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; - boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument( - noEqualityDeleteConf || fullEqualityDeleteConf, - "Equality field IDs and equality delete row schema must be set together"); - - return new FlinkFileWriterFactory( - table, - dataFileFormat, - dataSchema, - dataFlinkType, - dataSortOrder, - deleteFileFormat, - equalityFieldIds, - equalityDeleteRowSchema, - equalityDeleteFlinkType, - equalityDeleteSortOrder, - positionDeleteRowSchema, - positionDeleteFlinkType); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java deleted file mode 100644 index 13affd8484aa..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
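As a point of reference for the builder above, a writer factory is typically assembled from just a table and the desired file formats, leaving the Flink row types unset so they are derived from the Iceberg schemas; the table instance and the Parquet choice below are illustrative assumptions, not part of this patch's code.

    // Rough sketch; `table` is an already-loaded Iceberg Table (assumed).
    FlinkFileWriterFactory writerFactory =
        FlinkFileWriterFactory.builderFor(table)
            .dataSchema(table.schema())
            .dataFileFormat(FileFormat.PARQUET)    // placeholder choice
            .deleteFileFormat(FileFormat.PARQUET)  // placeholder choice
            .build();
    // With no dataFlinkType set, the factory falls back to FlinkSchemaUtil.convert(dataSchema()).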
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.function.Supplier; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class FlinkManifestUtil { - - private static final Logger LOG = LoggerFactory.getLogger(FlinkManifestUtil.class); - private static final int FORMAT_V2 = 2; - private static final Long DUMMY_SNAPSHOT_ID = 0L; - - private FlinkManifestUtil() {} - - static ManifestFile writeDataFiles( - OutputFile outputFile, PartitionSpec spec, List dataFiles) throws IOException { - ManifestWriter writer = - ManifestFiles.write(FORMAT_V2, spec, outputFile, DUMMY_SNAPSHOT_ID); - - try (ManifestWriter closeableWriter = writer) { - closeableWriter.addAll(dataFiles); - } - - return writer.toManifestFile(); - } - - static List readDataFiles( - ManifestFile manifestFile, FileIO io, Map specsById) - throws IOException { - try (CloseableIterable dataFiles = ManifestFiles.read(manifestFile, io, specsById)) { - return Lists.newArrayList(dataFiles); - } - } - - public static ManifestOutputFileFactory createOutputFileFactory( - Supplier
<Table>
    tableSupplier, - Map tableProps, - String flinkJobId, - String operatorUniqueId, - int subTaskId, - long attemptNumber) { - return new ManifestOutputFileFactory( - tableSupplier, tableProps, flinkJobId, operatorUniqueId, subTaskId, attemptNumber); - } - - /** - * Write the {@link WriteResult} to temporary manifest files. - * - * @param result all those DataFiles/DeleteFiles in this WriteResult should be written with same - * partition spec - */ - public static DeltaManifests writeCompletedFiles( - WriteResult result, Supplier outputFileSupplier, PartitionSpec spec) - throws IOException { - - ManifestFile dataManifest = null; - ManifestFile deleteManifest = null; - - // Write the completed data files into a newly created data manifest file. - if (result.dataFiles() != null && result.dataFiles().length > 0) { - dataManifest = - writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles())); - } - - // Write the completed delete files into a newly created delete manifest file. - if (result.deleteFiles() != null && result.deleteFiles().length > 0) { - OutputFile deleteManifestFile = outputFileSupplier.get(); - - ManifestWriter deleteManifestWriter = - ManifestFiles.writeDeleteManifest(FORMAT_V2, spec, deleteManifestFile, DUMMY_SNAPSHOT_ID); - try (ManifestWriter writer = deleteManifestWriter) { - for (DeleteFile deleteFile : result.deleteFiles()) { - writer.add(deleteFile); - } - } - - deleteManifest = deleteManifestWriter.toManifestFile(); - } - - return new DeltaManifests(dataManifest, deleteManifest, result.referencedDataFiles()); - } - - public static WriteResult readCompletedFiles( - DeltaManifests deltaManifests, FileIO io, Map specsById) - throws IOException { - WriteResult.Builder builder = WriteResult.builder(); - - // Read the completed data files from persisted data manifest file. - if (deltaManifests.dataManifest() != null) { - builder.addDataFiles(readDataFiles(deltaManifests.dataManifest(), io, specsById)); - } - - // Read the completed delete files from persisted delete manifests file. - if (deltaManifests.deleteManifest() != null) { - try (CloseableIterable deleteFiles = - ManifestFiles.readDeleteManifest(deltaManifests.deleteManifest(), io, specsById)) { - builder.addDeleteFiles(deleteFiles); - } - } - - return builder.addReferencedDataFiles(deltaManifests.referencedDataFiles()).build(); - } - - public static void deleteCommittedManifests( - Table table, List manifests, String newFlinkJobId, long checkpointId) { - deleteCommittedManifests(table.name(), table.io(), manifests, newFlinkJobId, checkpointId); - } - - static void deleteCommittedManifests( - String tableName, - FileIO io, - List manifestsPath, - String newFlinkJobId, - long checkpointId) { - for (ManifestFile manifest : manifestsPath) { - try { - io.deleteFile(manifest.path()); - } catch (Exception e) { - // The flink manifests cleaning failure shouldn't abort the completed checkpoint. 
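To illustrate the helpers above, a checkpoint's WriteResult can be round-tripped through temporary manifests roughly as follows; `writeResult`, `outputFileSupplier`, `spec`, `io` and `specsById` are assumed to come from the surrounding sink plumbing.

    // Persist the completed data/delete files of one checkpoint into small temporary manifests.
    DeltaManifests deltaManifests =
        FlinkManifestUtil.writeCompletedFiles(writeResult, outputFileSupplier, spec);

    // Later (for example while committing after a restore), rebuild the WriteResult from them.
    WriteResult restored =
        FlinkManifestUtil.readCompletedFiles(deltaManifests, io, specsById);
    // Both calls may throw IOException.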
- String details = - MoreObjects.toStringHelper(FlinkManifestUtil.class) - .add("tableName", tableName) - .add("flinkJobId", newFlinkJobId) - .add("checkpointId", checkpointId) - .add("manifestPath", manifest) - .toString(); - LOG.warn( - "The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}", - details, - e); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java deleted file mode 100644 index bca60745d479..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java +++ /dev/null @@ -1,773 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.time.Duration; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.function.Function; -import org.apache.flink.api.common.functions.FlatMapFunction; -import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.common.typeinfo.Types; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSink; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.flink.streaming.api.functions.sink.v2.DiscardingSink; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Partitioning; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.sink.shuffle.DataStatisticsOperatorFactory; -import 
org.apache.iceberg.flink.sink.shuffle.RangePartitioner; -import org.apache.iceberg.flink.sink.shuffle.StatisticsOrRecord; -import org.apache.iceberg.flink.sink.shuffle.StatisticsOrRecordTypeInformation; -import org.apache.iceberg.flink.sink.shuffle.StatisticsType; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.SerializableSupplier; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class FlinkSink { - private static final Logger LOG = LoggerFactory.getLogger(FlinkSink.class); - - private static final String ICEBERG_STREAM_WRITER_NAME = - IcebergStreamWriter.class.getSimpleName(); - private static final String ICEBERG_FILES_COMMITTER_NAME = - IcebergFilesCommitter.class.getSimpleName(); - - private FlinkSink() {} - - /** - * Initialize a {@link Builder} to export the data from generic input data stream into iceberg - * table. We use {@link RowData} inside the sink connector, so users need to provide a mapper - * function and a {@link TypeInformation} to convert those generic records to a RowData - * DataStream. - * - * @param input the generic source input data stream. - * @param mapper function to convert the generic data to {@link RowData} - * @param outputType to define the {@link TypeInformation} for the input data. - * @param the data type of records. - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder builderFor( - DataStream input, MapFunction mapper, TypeInformation outputType) { - return new Builder().forMapperOutputType(input, mapper, outputType); - } - - /** - * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into - * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a - * {@link TableSchema} for builder to convert those {@link Row}s to a {@link RowData} DataStream. - * - * @param input the source input data stream with {@link Row}s. - * @param tableSchema defines the {@link TypeInformation} for input data. - * @return {@link Builder} to connect the iceberg table. - * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #forRow(DataStream, - * ResolvedSchema)} instead. - */ - @Deprecated - public static Builder forRow(DataStream input, TableSchema tableSchema) { - RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); - DataType[] fieldDataTypes = tableSchema.getFieldDataTypes(); - - DataFormatConverters.RowConverter rowConverter = - new DataFormatConverters.RowConverter(fieldDataTypes); - return builderFor(input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) - .tableSchema(tableSchema); - } - - /** - * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into - * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a - * {@link ResolvedSchema} for builder to convert those {@link Row}s to a {@link RowData} - * DataStream. - * - * @param input the source input data stream with {@link Row}s. - * @param resolvedSchema defines the {@link TypeInformation} for input data. 
- * @return {@link Builder} to connect the iceberg table. - */ - public static Builder forRow(DataStream input, ResolvedSchema resolvedSchema) { - RowType rowType = (RowType) resolvedSchema.toSinkRowDataType().getLogicalType(); - DataType[] fieldDataTypes = resolvedSchema.getColumnDataTypes().toArray(DataType[]::new); - - DataFormatConverters.RowConverter rowConverter = - new DataFormatConverters.RowConverter(fieldDataTypes); - return builderFor(input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) - .resolvedSchema(resolvedSchema); - } - - /** - * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s - * into iceberg table. - * - * @param input the source input data stream with {@link RowData}s. - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder forRowData(DataStream input) { - return new Builder().forRowData(input); - } - - public static class Builder { - private Function> inputCreator = null; - private TableLoader tableLoader; - private Table table; - @Deprecated private TableSchema tableSchema; - private ResolvedSchema resolvedSchema; - private List equalityFieldColumns = null; - private String uidPrefix = null; - private final Map snapshotProperties = Maps.newHashMap(); - private ReadableConfig readableConfig = new Configuration(); - private final Map writeOptions = Maps.newHashMap(); - private FlinkWriteConf flinkWriteConf = null; - - private Builder() {} - - private Builder forRowData(DataStream newRowDataInput) { - this.inputCreator = ignored -> newRowDataInput; - return this; - } - - private Builder forMapperOutputType( - DataStream input, MapFunction mapper, TypeInformation outputType) { - this.inputCreator = - newUidPrefix -> { - // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we - // need to set the parallelism - // of map operator same as its input to keep map operator chaining its input, and avoid - // rebalanced by default. - SingleOutputStreamOperator inputStream = - input.map(mapper, outputType).setParallelism(input.getParallelism()); - if (newUidPrefix != null) { - inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper"); - } - return inputStream; - }; - return this; - } - - /** - * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter} - * which will write all the records into {@link DataFile}s and emit them to downstream operator. - * Providing a table would avoid so many table loading from each separate task. - * - * @param newTable the loaded iceberg table instance. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder table(Table newTable) { - this.table = newTable; - return this; - } - - /** - * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need - * this loader because {@link Table} is not serializable and could not just use the loaded table - * from Builder#table in the remote task manager. - * - * @param newTableLoader to load iceberg table inside tasks. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder tableLoader(TableLoader newTableLoader) { - this.tableLoader = newTableLoader; - return this; - } - - /** - * Set the write properties for Flink sink. View the supported properties in {@link - * FlinkWriteOptions} - */ - public Builder set(String property, String value) { - writeOptions.put(property, value); - return this; - } - - /** - * Set the write properties for Flink sink. 
View the supported properties in {@link - * FlinkWriteOptions} - */ - public Builder setAll(Map properties) { - writeOptions.putAll(properties); - return this; - } - - /** - * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link - * #resolvedSchema(ResolvedSchema)} instead. - */ - @Deprecated - public Builder tableSchema(TableSchema newTableSchema) { - this.tableSchema = newTableSchema; - return this; - } - - public Builder resolvedSchema(ResolvedSchema newResolvedSchema) { - this.resolvedSchema = newResolvedSchema; - return this; - } - - public Builder overwrite(boolean newOverwrite) { - writeOptions.put(FlinkWriteOptions.OVERWRITE_MODE.key(), Boolean.toString(newOverwrite)); - return this; - } - - public Builder flinkConf(ReadableConfig config) { - this.readableConfig = config; - return this; - } - - /** - * Configure the write {@link DistributionMode} that the flink sink will use. Currently, flink - * support {@link DistributionMode#NONE} and {@link DistributionMode#HASH}. - * - * @param mode to specify the write distribution mode. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder distributionMode(DistributionMode mode) { - if (mode != null) { - writeOptions.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), mode.modeName()); - } - return this; - } - - /** - * Range distribution needs to collect statistics about data distribution to properly shuffle - * the records in relatively balanced way. In general, low cardinality should use {@link - * StatisticsType#Map} and high cardinality should use {@link StatisticsType#Sketch} Refer to - * {@link StatisticsType} Javadoc for more details. - * - *
    Default is {@link StatisticsType#Auto} where initially Map statistics is used. But if - * cardinality is higher than the threshold (currently 10K) as defined in {@code - * SketchUtil#OPERATOR_SKETCH_SWITCH_THRESHOLD}, statistics collection automatically switches to - * the sketch reservoir sampling. - * - *
    Explicit set the statistics type if the default behavior doesn't work. - * - * @param type to specify the statistics type for range distribution. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder rangeDistributionStatisticsType(StatisticsType type) { - if (type != null) { - writeOptions.put(FlinkWriteOptions.RANGE_DISTRIBUTION_STATISTICS_TYPE.key(), type.name()); - } - return this; - } - - /** - * If sort order contains partition columns, each sort key would map to one partition and data - * file. This relative weight can avoid placing too many small files for sort keys with low - * traffic. It is a double value that defines the minimal weight for each sort key. `0.02` means - * each key has a base weight of `2%` of the targeted traffic weight per writer task. - * - *
    E.g. the sink Iceberg table is partitioned daily by event time. Assume the data stream - * contains events from now up to 180 days ago. With event time, traffic weight distribution - * across different days typically has a long tail pattern. Current day contains the most - * traffic. The older days (long tail) contain less and less traffic. Assume writer parallelism - * is `10`. The total weight across all 180 days is `10,000`. Target traffic weight per writer - * task would be `1,000`. Assume the weight sum for the oldest 150 days is `1,000`. Normally, - * the range partitioner would put all the oldest 150 days in one writer task. That writer task - * would write to 150 small files (one per day). Keeping 150 open files can potentially consume - * large amount of memory. Flushing and uploading 150 files (however small) at checkpoint time - * can also be potentially slow. If this config is set to `0.02`. It means every sort key has a - * base weight of `2%` of targeted weight of `1,000` for every write task. It would essentially - * avoid placing more than `50` data files (one per day) on one writer task no matter how small - * they are. - * - *
    This is only applicable to {@link StatisticsType#Map} for low-cardinality scenario. For - * {@link StatisticsType#Sketch} high-cardinality sort columns, they are usually not used as - * partition columns. Otherwise, too many partitions and small files may be generated during - * write. Sketch range partitioner simply splits high-cardinality keys into ordered ranges. - * - *
    Default is {@code 0.0%}. - */ - public Builder rangeDistributionSortKeyBaseWeight(double weight) { - writeOptions.put( - FlinkWriteOptions.RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT.key(), Double.toString(weight)); - return this; - } - - /** - * Configuring the write parallel number for iceberg stream writer. - * - * @param newWriteParallelism the number of parallel iceberg stream writer. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder writeParallelism(int newWriteParallelism) { - writeOptions.put( - FlinkWriteOptions.WRITE_PARALLELISM.key(), Integer.toString(newWriteParallelism)); - return this; - } - - /** - * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which - * means it will DELETE the old records and then INSERT the new records. In partitioned table, - * the partition fields should be a subset of equality fields, otherwise the old row that - * located in partition-A could not be deleted by the new row that located in partition-B. - * - * @param enabled indicate whether it should transform all INSERT/UPDATE_AFTER events to UPSERT. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder upsert(boolean enabled) { - writeOptions.put(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key(), Boolean.toString(enabled)); - return this; - } - - /** - * Configuring the equality field columns for iceberg table that accept CDC or UPSERT events. - * - * @param columns defines the iceberg table's key. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder equalityFieldColumns(List columns) { - this.equalityFieldColumns = columns; - return this; - } - - /** - * Set the uid prefix for FlinkSink operators. Note that FlinkSink internally consists of - * multiple operators (like writer, committer, dummy sink etc.) Actually operator uid will be - * appended with a suffix like "uidPrefix-writer".
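Putting the builder methods above together, a basic streaming append looks roughly like this; the table location, input stream and option values are illustrative assumptions.

    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.table.data.RowData;
    import org.apache.iceberg.DistributionMode;
    import org.apache.iceberg.flink.TableLoader;
    import org.apache.iceberg.flink.sink.FlinkSink;

    public class IcebergSinkSketch {
      // `rows` is assumed to carry RowData matching the target table's schema.
      static void appendToTable(DataStream<RowData> rows) {
        TableLoader tableLoader =
            TableLoader.fromHadoopTable("hdfs://namenode:8020/warehouse/db/events"); // placeholder

        FlinkSink.forRowData(rows)
            .tableLoader(tableLoader)
            .distributionMode(DistributionMode.HASH)
            .writeParallelism(4)
            .uidPrefix("iceberg-events-sink") // operators get uids such as "iceberg-events-sink-writer"
            .append();
      }
    }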
    - *
    - * If provided, this prefix is also applied to operator names.
    - *
    - * Flink auto generates operator uid if not set explicitly. It is a recommended - * best-practice to set uid for all operators before deploying to production. Flink has an - * option to {@code pipeline.auto-generate-uid=false} to disable auto-generation and force - * explicit setting of all operator uid.
    - *
    - * Be careful with setting this for an existing job, because now we are changing the operator - * uid from an auto-generated one to this new value. When deploying the change with a - * checkpoint, Flink won't be able to restore the previous Flink sink operator state (more - * specifically the committer operator state). You need to use {@code --allowNonRestoredState} - * to ignore the previous sink state. During restore Flink sink state is used to check if last - * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss - * if the Iceberg commit failed in the last completed checkpoint. - * - * @param newPrefix prefix for Flink sink operator uid and name - * @return {@link Builder} to connect the iceberg table. - */ - public Builder uidPrefix(String newPrefix) { - this.uidPrefix = newPrefix; - return this; - } - - public Builder setSnapshotProperties(Map properties) { - snapshotProperties.putAll(properties); - return this; - } - - public Builder setSnapshotProperty(String property, String value) { - snapshotProperties.put(property, value); - return this; - } - - public Builder toBranch(String branch) { - writeOptions.put(FlinkWriteOptions.BRANCH.key(), branch); - return this; - } - - private DataStreamSink chainIcebergOperators() { - Preconditions.checkArgument( - inputCreator != null, - "Please use forRowData() or forMapperOutputType() to initialize the input DataStream."); - Preconditions.checkNotNull(tableLoader, "Table loader shouldn't be null"); - - DataStream rowDataInput = inputCreator.apply(uidPrefix); - - if (table == null) { - if (!tableLoader.isOpen()) { - tableLoader.open(); - } - - try (TableLoader loader = tableLoader) { - this.table = loader.loadTable(); - } catch (IOException e) { - throw new UncheckedIOException( - "Failed to load iceberg table from table loader: " + tableLoader, e); - } - } - - flinkWriteConf = new FlinkWriteConf(table, writeOptions, readableConfig); - - // Find out the equality field id list based on the user-provided equality field column names. - Set equalityFieldIds = - SinkUtil.checkAndGetEqualityFieldIds(table, equalityFieldColumns); - - RowType flinkRowType = - resolvedSchema != null - ? toFlinkRowType(table.schema(), resolvedSchema) - : toFlinkRowType(table.schema(), tableSchema); - int writerParallelism = - flinkWriteConf.writeParallelism() == null - ? rowDataInput.getParallelism() - : flinkWriteConf.writeParallelism(); - - // Distribute the records from input data stream based on the write.distribution-mode and - // equality fields. - DataStream distributeStream = - distributeDataStream(rowDataInput, equalityFieldIds, flinkRowType, writerParallelism); - - // Add parallel writers that append rows to files - SingleOutputStreamOperator writerStream = - appendWriter(distributeStream, flinkRowType, equalityFieldIds, writerParallelism); - - // Add single-parallelism committer that commits files - // after successful checkpoint or end of input - SingleOutputStreamOperator committerStream = appendCommitter(writerStream); - - // Add dummy discard sink - return appendDummySink(committerStream); - } - - /** - * Append the iceberg sink operators to write records to iceberg table. - * - * @return {@link DataStreamSink} for sink. - */ - public DataStreamSink append() { - return chainIcebergOperators(); - } - - private String operatorName(String suffix) { - return uidPrefix != null ? 
uidPrefix + "-" + suffix : suffix; - } - - @VisibleForTesting - List checkAndGetEqualityFieldIds() { - List equalityFieldIds = Lists.newArrayList(table.schema().identifierFieldIds()); - if (equalityFieldColumns != null && !equalityFieldColumns.isEmpty()) { - Set equalityFieldSet = - Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); - for (String column : equalityFieldColumns) { - org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column); - Preconditions.checkNotNull( - field, - "Missing required equality field column '%s' in table schema %s", - column, - table.schema()); - equalityFieldSet.add(field.fieldId()); - } - - if (!equalityFieldSet.equals(table.schema().identifierFieldIds())) { - LOG.warn( - "The configured equality field column IDs {} are not matched with the schema identifier field IDs" - + " {}, use job specified equality field columns as the equality fields by default.", - equalityFieldSet, - table.schema().identifierFieldIds()); - } - equalityFieldIds = Lists.newArrayList(equalityFieldSet); - } - return equalityFieldIds; - } - - private DataStreamSink appendDummySink(SingleOutputStreamOperator committerStream) { - DataStreamSink resultStream = - committerStream - .sinkTo(new DiscardingSink<>()) - .name(operatorName(String.format("IcebergSink %s", this.table.name()))) - .setParallelism(1); - if (uidPrefix != null) { - resultStream = resultStream.uid(uidPrefix + "-dummysink"); - } - return resultStream; - } - - private SingleOutputStreamOperator appendCommitter( - SingleOutputStreamOperator writerStream) { - IcebergFilesCommitter filesCommitter = - new IcebergFilesCommitter( - tableLoader, - flinkWriteConf.overwriteMode(), - snapshotProperties, - flinkWriteConf.workerPoolSize(), - flinkWriteConf.branch(), - table.spec()); - SingleOutputStreamOperator committerStream = - writerStream - .transform(operatorName(ICEBERG_FILES_COMMITTER_NAME), Types.VOID, filesCommitter) - .setParallelism(1) - .setMaxParallelism(1); - if (uidPrefix != null) { - committerStream = committerStream.uid(uidPrefix + "-committer"); - } - return committerStream; - } - - private SingleOutputStreamOperator appendWriter( - DataStream input, - RowType flinkRowType, - Set equalityFieldIds, - int writerParallelism) { - // Validate the equality fields and partition fields if we enable the upsert mode. - if (flinkWriteConf.upsertMode()) { - Preconditions.checkState( - !flinkWriteConf.overwriteMode(), - "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); - Preconditions.checkState( - !equalityFieldIds.isEmpty(), - "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); - if (!table.spec().isUnpartitioned()) { - for (PartitionField partitionField : table.spec().fields()) { - Preconditions.checkState( - equalityFieldIds.contains(partitionField.sourceId()), - "In UPSERT mode, source column '%s' of partition field '%s', should be included in equality fields: '%s'", - table.schema().findColumnName(partitionField.sourceId()), - partitionField, - equalityFieldColumns); - } - } - } - - SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); - Duration tableRefreshInterval = flinkWriteConf.tableRefreshInterval(); - - SerializableSupplier
<Table>
    tableSupplier; - if (tableRefreshInterval != null) { - tableSupplier = - new CachingTableSupplier(serializableTable, tableLoader, tableRefreshInterval); - } else { - tableSupplier = () -> serializableTable; - } - - IcebergStreamWriter streamWriter = - createStreamWriter(tableSupplier, flinkWriteConf, flinkRowType, equalityFieldIds); - - SingleOutputStreamOperator writerStream = - input - .transform( - operatorName(ICEBERG_STREAM_WRITER_NAME), - TypeInformation.of(FlinkWriteResult.class), - streamWriter) - .setParallelism(writerParallelism); - if (uidPrefix != null) { - writerStream = writerStream.uid(uidPrefix + "-writer"); - } - return writerStream; - } - - private DataStream distributeDataStream( - DataStream input, - Set equalityFieldIds, - RowType flinkRowType, - int writerParallelism) { - DistributionMode writeMode = flinkWriteConf.distributionMode(); - LOG.info("Write distribution mode is '{}'", writeMode.modeName()); - - Schema iSchema = table.schema(); - PartitionSpec partitionSpec = table.spec(); - SortOrder sortOrder = table.sortOrder(); - - switch (writeMode) { - case NONE: - if (equalityFieldIds.isEmpty()) { - return input; - } else { - LOG.info("Distribute rows by equality fields, because there are equality fields set"); - return input.keyBy( - new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } - - case HASH: - if (equalityFieldIds.isEmpty()) { - if (partitionSpec.isUnpartitioned()) { - LOG.warn( - "Fallback to use 'none' distribution mode, because there are no equality fields set " - + "and table is unpartitioned"); - return input; - } else { - return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); - } - } else { - if (partitionSpec.isUnpartitioned()) { - LOG.info( - "Distribute rows by equality fields, because there are equality fields set " - + "and table is unpartitioned"); - return input.keyBy( - new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } else { - for (PartitionField partitionField : partitionSpec.fields()) { - Preconditions.checkState( - equalityFieldIds.contains(partitionField.sourceId()), - "In 'hash' distribution mode with equality fields set, source column '%s' of partition field '%s' " - + "should be included in equality fields: '%s'", - table.schema().findColumnName(partitionField.sourceId()), - partitionField, - equalityFieldColumns); - } - return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); - } - } - - case RANGE: - // Ideally, exception should be thrown in the combination of range distribution and - // equality fields. Primary key case should use hash distribution mode. - // Keep the current behavior of falling back to keyBy for backward compatibility. - if (!equalityFieldIds.isEmpty()) { - LOG.warn( - "Hash distribute rows by equality fields, even though {}=range is set. " - + "Range distribution for primary keys are not always safe in " - + "Flink streaming writer.", - WRITE_DISTRIBUTION_MODE); - return input.keyBy( - new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } - - // range distribute by partition key or sort key if table has an SortOrder - Preconditions.checkState( - sortOrder.isSorted() || partitionSpec.isPartitioned(), - "Invalid write distribution mode: range. 
Need to define sort order or partition spec."); - if (sortOrder.isUnsorted()) { - sortOrder = Partitioning.sortOrderFor(partitionSpec); - LOG.info("Construct sort order from partition spec"); - } - - LOG.info("Range distribute rows by sort order: {}", sortOrder); - StatisticsOrRecordTypeInformation statisticsOrRecordTypeInformation = - new StatisticsOrRecordTypeInformation(flinkRowType, iSchema, sortOrder); - StatisticsType statisticsType = flinkWriteConf.rangeDistributionStatisticsType(); - SingleOutputStreamOperator shuffleStream = - input - .transform( - operatorName("range-shuffle"), - statisticsOrRecordTypeInformation, - new DataStatisticsOperatorFactory( - iSchema, - sortOrder, - writerParallelism, - statisticsType, - flinkWriteConf.rangeDistributionSortKeyBaseWeight())) - // Set the parallelism same as input operator to encourage chaining - .setParallelism(input.getParallelism()); - if (uidPrefix != null) { - shuffleStream = shuffleStream.uid(uidPrefix + "-shuffle"); - } - - return shuffleStream - .partitionCustom(new RangePartitioner(iSchema, sortOrder), r -> r) - .flatMap( - (FlatMapFunction) - (statisticsOrRecord, out) -> { - if (statisticsOrRecord.hasRecord()) { - out.collect(statisticsOrRecord.record()); - } - }) - // Set the parallelism same as writerParallelism to - // promote operator chaining with the downstream writer operator - .setParallelism(writerParallelism) - .returns(RowData.class); - - default: - throw new RuntimeException("Unrecognized " + WRITE_DISTRIBUTION_MODE + ": " + writeMode); - } - } - } - - /** - * Clean up after removing {@link Builder#tableSchema} - * - * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #toFlinkRowType(Schema, - * ResolvedSchema)} instead. - */ - @Deprecated - static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { - if (requestedSchema != null) { - // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing - // iceberg schema. - Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); - TypeUtil.validateWriteSchema(schema, writeSchema, true, true); - - // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will - // be promoted to iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT - // (backend by 1 'byte'), we will read 4 bytes rather than 1 byte, it will mess up the - // byte array in BinaryRowData. So here we must use flink schema. - return (RowType) requestedSchema.toRowDataType().getLogicalType(); - } else { - return FlinkSchemaUtil.convert(schema); - } - } - - static RowType toFlinkRowType(Schema schema, ResolvedSchema requestedSchema) { - if (requestedSchema != null) { - // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing - // iceberg schema. - Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); - TypeUtil.validateWriteSchema(schema, writeSchema, true, true); - - // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will - // be promoted to iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT - // (backend by 1 'byte'), we will read 4 bytes rather than 1 byte, it will mess up the - // byte array in BinaryRowData. So here we must use flink schema. 
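For the range branch above, the same builder can request range distribution explicitly; the statistics type and sort-key base weight below are illustrative, and the table is assumed to define a sort order or partition spec, as the precondition requires.

    FlinkSink.forRowData(rows)
        .tableLoader(tableLoader)
        .distributionMode(DistributionMode.RANGE)
        .rangeDistributionStatisticsType(StatisticsType.Auto) // Map stats first, Sketch on high cardinality
        .rangeDistributionSortKeyBaseWeight(0.02d)            // 2% base weight per sort key
        .writeParallelism(4)
        .append();
    // `rows` and `tableLoader` as in the earlier sketch; StatisticsType is
    // org.apache.iceberg.flink.sink.shuffle.StatisticsType.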
- return (RowType) requestedSchema.toSinkRowDataType().getLogicalType(); - } else { - return FlinkSchemaUtil.convert(schema); - } - } - - static IcebergStreamWriter createStreamWriter( - SerializableSupplier
    tableSupplier, - FlinkWriteConf flinkWriteConf, - RowType flinkRowType, - Set equalityFieldIds) { - Preconditions.checkArgument(tableSupplier != null, "Iceberg table supplier shouldn't be null"); - - Table initTable = tableSupplier.get(); - FileFormat format = flinkWriteConf.dataFileFormat(); - TaskWriterFactory taskWriterFactory = - new RowDataTaskWriterFactory( - tableSupplier, - flinkRowType, - flinkWriteConf.targetDataFileSize(), - format, - SinkUtil.writeProperties(format, flinkWriteConf, initTable), - equalityFieldIds, - flinkWriteConf.upsertMode()); - - return new IcebergStreamWriter<>(initTable.name(), taskWriterFactory); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java deleted file mode 100644 index 317fb169ae1b..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkWriteResult.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.Serializable; -import org.apache.iceberg.io.WriteResult; - -public class FlinkWriteResult implements Serializable { - private final long checkpointId; - private final WriteResult writeResult; - - public FlinkWriteResult(long checkpointId, WriteResult writeResult) { - this.checkpointId = checkpointId; - this.writeResult = writeResult; - } - - public long checkpointId() { - return checkpointId; - } - - public WriteResult writeResult() { - return writeResult; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java deleted file mode 100644 index 408c3e9a9d5f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittable.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.Objects; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -/** - * The aggregated results of a single checkpoint which should be committed. Containing the - * serialized {@link org.apache.iceberg.flink.sink.DeltaManifests} file - which contains the commit - * data, and the jobId, operatorId, checkpointId triplet which helps identifying the specific commit - * - *
    {@link IcebergCommittableSerializer} is used for serializing the objects between the Writer - * and the Aggregator operator and between the Aggregator and the Committer as well. - */ -class IcebergCommittable implements Serializable { - private final byte[] manifest; - private final String jobId; - private final String operatorId; - private final long checkpointId; - - IcebergCommittable(byte[] manifest, String jobId, String operatorId, long checkpointId) { - this.manifest = manifest; - this.jobId = jobId; - this.operatorId = operatorId; - this.checkpointId = checkpointId; - } - - byte[] manifest() { - return manifest; - } - - String jobId() { - return jobId; - } - - String operatorId() { - return operatorId; - } - - Long checkpointId() { - return checkpointId; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("jobId", jobId) - .add("checkpointId", checkpointId) - .add("operatorId", operatorId) - .toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (o == null || getClass() != o.getClass()) { - return false; - } - - IcebergCommittable that = (IcebergCommittable) o; - return checkpointId == that.checkpointId - && Arrays.equals(manifest, that.manifest) - && Objects.equals(jobId, that.jobId) - && Objects.equals(operatorId, that.operatorId); - } - - @Override - public int hashCode() { - int result = Objects.hash(jobId, operatorId, checkpointId); - result = 31 * result + Arrays.hashCode(manifest); - return result; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java deleted file mode 100644 index 1d83c211e001..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommittableSerializer.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputViewStreamWrapper; - -/** - * This serializer is used for serializing the {@link IcebergCommittable} objects between the Writer - * and the Aggregator operator and between the Aggregator and the Committer as well. - * - *
    In both cases only the respective part is serialized. - */ -public class IcebergCommittableSerializer implements SimpleVersionedSerializer { - private static final int VERSION = 1; - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(IcebergCommittable committable) throws IOException { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - DataOutputViewStreamWrapper view = new DataOutputViewStreamWrapper(out); - view.writeUTF(committable.jobId()); - view.writeUTF(committable.operatorId()); - view.writeLong(committable.checkpointId()); - view.writeInt(committable.manifest().length); - view.write(committable.manifest()); - return out.toByteArray(); - } - - @Override - public IcebergCommittable deserialize(int version, byte[] serialized) throws IOException { - if (version == 1) { - DataInputDeserializer view = new DataInputDeserializer(serialized); - String jobId = view.readUTF(); - String operatorId = view.readUTF(); - long checkpointId = view.readLong(); - int manifestLen = view.readInt(); - byte[] manifestBuf; - manifestBuf = new byte[manifestLen]; - view.read(manifestBuf); - return new IcebergCommittable(manifestBuf, jobId, operatorId, checkpointId); - } - throw new IOException("Unrecognized version or corrupt state: " + version); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java deleted file mode 100644 index c05e7d918093..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergCommitter.java +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
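For reference, the committable and its serializer round-trip symmetrically; `deltaManifests`, `jobId`, `operatorId` and `checkpointId` are assumed to come from the surrounding writer.

    byte[] manifestBytes =
        SimpleVersionedSerialization.writeVersionAndSerialize(
            DeltaManifestsSerializer.INSTANCE, deltaManifests);
    IcebergCommittable committable =
        new IcebergCommittable(manifestBytes, jobId, operatorId, checkpointId);

    IcebergCommittableSerializer serializer = new IcebergCommittableSerializer();
    byte[] wireBytes = serializer.serialize(committable);                             // may throw IOException
    IcebergCommittable decoded = serializer.deserialize(serializer.getVersion(), wireBytes);
    // decoded.equals(committable): manifest bytes, jobId, operatorId and checkpointId all match.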
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import org.apache.flink.api.connector.sink2.Committer; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ReplacePartitions; -import org.apache.iceberg.RowDelta; -import org.apache.iceberg.SnapshotUpdate; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This class implements the Flink SinkV2 {@link Committer} interface to implement the Iceberg - * commits. The implementation builds on the following assumptions: - * - *
- *   • There is a single {@link IcebergCommittable} for every checkpoint
- *   • There is no late checkpoint - if checkpoint 'x' has received in one call, then after a
- *     successful run only checkpoints > x will arrive
- *   • There is no other writer which would generate another commit to the same branch with the
- *     same jobId-operatorId-checkpointId triplet
- *
    - */ -class IcebergCommitter implements Committer { - private static final Logger LOG = LoggerFactory.getLogger(IcebergCommitter.class); - private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; - public static final WriteResult EMPTY_WRITE_RESULT = - WriteResult.builder() - .addDataFiles(Lists.newArrayList()) - .addDeleteFiles(Lists.newArrayList()) - .build(); - - @VisibleForTesting - static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; - - private final String branch; - private final Map snapshotProperties; - private final boolean replacePartitions; - private IcebergFilesCommitterMetrics committerMetrics; - private Table table; - private final TableLoader tableLoader; - private int maxContinuousEmptyCommits; - private ExecutorService workerPool; - private int continuousEmptyCheckpoints = 0; - private boolean compactMode = false; - - IcebergCommitter( - TableLoader tableLoader, - String branch, - Map snapshotProperties, - boolean replacePartitions, - int workerPoolSize, - String sinkId, - IcebergFilesCommitterMetrics committerMetrics, - boolean compactMode) { - this.branch = branch; - this.snapshotProperties = snapshotProperties; - this.replacePartitions = replacePartitions; - this.committerMetrics = committerMetrics; - this.tableLoader = tableLoader; - if (!tableLoader.isOpen()) { - tableLoader.open(); - } - - this.table = tableLoader.loadTable(); - this.maxContinuousEmptyCommits = - PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); - Preconditions.checkArgument( - maxContinuousEmptyCommits > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); - this.workerPool = - ThreadPools.newFixedThreadPool( - "iceberg-committer-pool-" + table.name() + "-" + sinkId, workerPoolSize); - this.continuousEmptyCheckpoints = 0; - this.compactMode = compactMode; - } - - @Override - public void commit(Collection> commitRequests) - throws IOException, InterruptedException { - if (commitRequests.isEmpty()) { - return; - } - - NavigableMap> commitRequestMap = Maps.newTreeMap(); - for (CommitRequest request : commitRequests) { - commitRequestMap.put(request.getCommittable().checkpointId(), request); - } - - IcebergCommittable last = commitRequestMap.lastEntry().getValue().getCommittable(); - long maxCommittedCheckpointId = - SinkUtil.getMaxCommittedCheckpointId(table, last.jobId(), last.operatorId(), branch); - // Mark the already committed FilesCommittable(s) as finished - commitRequestMap - .headMap(maxCommittedCheckpointId, true) - .values() - .forEach(CommitRequest::signalAlreadyCommitted); - NavigableMap> uncommitted = - commitRequestMap.tailMap(maxCommittedCheckpointId, false); - if (!uncommitted.isEmpty()) { - commitPendingRequests(uncommitted, last.jobId(), last.operatorId()); - } - } - - /** - * Commits the data to the Iceberg table by reading the file data from the {@link - * org.apache.iceberg.flink.sink.DeltaManifests} ordered by the checkpointId, and writing the new - * snapshot to the Iceberg table. The {@link org.apache.iceberg.SnapshotSummary} will contain the - * jobId, snapshotId, checkpointId so in case of job restart we can identify which changes are - * committed, and which are still waiting for the commit. 
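The dedup step in commit() above can be pictured with a small NavigableMap sketch: everything at or below the committed watermark is acknowledged as already committed, the rest is committed in ascending checkpoint order. The ids and println placeholders are illustrative only.

    // Requires java.util.NavigableMap and java.util.TreeMap.
    long maxCommittedCheckpointId = 7L; // read back from the table's snapshot summary

    NavigableMap<Long, String> pending = new TreeMap<>();
    pending.put(7L, "manifest-7"); // stale: already committed by a previous attempt
    pending.put(8L, "manifest-8"); // still pending
    pending.put(9L, "manifest-9"); // still pending

    // Acknowledge everything at or below the watermark ...
    pending.headMap(maxCommittedCheckpointId, true)
        .values()
        .forEach(m -> System.out.println("already committed: " + m));

    // ... and commit only the strictly newer checkpoints, in order.
    pending.tailMap(maxCommittedCheckpointId, false)
        .forEach((checkpointId, manifest) ->
            System.out.println("commit checkpoint " + checkpointId + " with " + manifest));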
- * - * @param commitRequestMap The checkpointId to {@link CommitRequest} map of the changes to commit - * @param newFlinkJobId The jobId to store in the {@link org.apache.iceberg.SnapshotSummary} - * @param operatorId The operatorId to store in the {@link org.apache.iceberg.SnapshotSummary} - * @throws IOException On commit failure - */ - private void commitPendingRequests( - NavigableMap> commitRequestMap, - String newFlinkJobId, - String operatorId) - throws IOException { - long checkpointId = commitRequestMap.lastKey(); - List manifests = Lists.newArrayList(); - NavigableMap pendingResults = Maps.newTreeMap(); - for (Map.Entry> e : commitRequestMap.entrySet()) { - if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue().getCommittable().manifest())) { - pendingResults.put(e.getKey(), EMPTY_WRITE_RESULT); - } else { - DeltaManifests deltaManifests = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, e.getValue().getCommittable().manifest()); - pendingResults.put( - e.getKey(), - FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs())); - manifests.addAll(deltaManifests.manifests()); - } - } - - CommitSummary summary = new CommitSummary(pendingResults); - commitPendingResult(pendingResults, summary, newFlinkJobId, operatorId); - if (committerMetrics != null) { - committerMetrics.updateCommitSummary(summary); - } - - if (!compactMode) { - FlinkManifestUtil.deleteCommittedManifests(table, manifests, newFlinkJobId, checkpointId); - } - } - - private void logCommitSummary(CommitSummary summary, String description) { - LOG.info( - "Preparing for commit: {} on table: {} branch: {} with summary: {}.", - description, - table, - branch, - summary); - } - - private void commitPendingResult( - NavigableMap pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId) { - long totalFiles = summary.dataFilesCount() + summary.deleteFilesCount(); - continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0; - if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { - if (replacePartitions) { - replacePartitions(pendingResults, summary, newFlinkJobId, operatorId); - } else { - commitDeltaTxn(pendingResults, summary, newFlinkJobId, operatorId); - } - continuousEmptyCheckpoints = 0; - } else { - long checkpointId = pendingResults.lastKey(); - LOG.info("Skip commit for checkpoint {} due to no data files or delete files.", checkpointId); - } - } - - private void replacePartitions( - NavigableMap pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId) { - long checkpointId = pendingResults.lastKey(); - Preconditions.checkState( - summary.deleteFilesCount() == 0, "Cannot overwrite partitions with delete files."); - // Commit the overwrite transaction. 
- ReplacePartitions dynamicOverwrite = table.newReplacePartitions().scanManifestsWith(workerPool); - for (WriteResult result : pendingResults.values()) { - Preconditions.checkState( - result.referencedDataFiles().length == 0, "Should have no referenced data files."); - Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); - } - String description = "dynamic partition overwrite"; - - logCommitSummary(summary, description); - commitOperation(dynamicOverwrite, description, newFlinkJobId, operatorId, checkpointId); - } - - private void commitDeltaTxn( - NavigableMap pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId) { - long checkpointId = pendingResults.lastKey(); - if (summary.deleteFilesCount() == 0) { - // To be compatible with iceberg format V1. - AppendFiles appendFiles = table.newAppend().scanManifestsWith(workerPool); - for (WriteResult result : pendingResults.values()) { - Preconditions.checkState( - result.referencedDataFiles().length == 0, - "Should have no referenced data files for append."); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - } - String description = "append"; - logCommitSummary(summary, description); - // fail all commits as really its only one - commitOperation(appendFiles, description, newFlinkJobId, operatorId, checkpointId); - } else { - // To be compatible with iceberg format V2. - for (Map.Entry e : pendingResults.entrySet()) { - // We don't commit the merged result into a single transaction because for the sequential - // transaction txn1 and txn2, the equality-delete files of txn2 are required to be applied - // to data files from txn1. Committing the merged one will lead to the incorrect delete - // semantic. - WriteResult result = e.getValue(); - - // Row delta validations are not needed for streaming changes that write equality deletes. - // Equality deletes are applied to data in all previous sequence numbers, so retries may - // push deletes further in the future, but do not affect correctness. Position deletes - // committed to the table in this path are used only to delete rows from data files that are - // being added in this commit. There is no way for data files added along with the delete - // files to be concurrently removed, so there is no need to validate the files referenced by - // the position delete files that are being committed. - RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool); - - Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); - Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - - String description = "rowDelta"; - logCommitSummary(summary, description); - commitOperation(rowDelta, description, newFlinkJobId, operatorId, e.getKey()); - } - } - } - - private void commitOperation( - SnapshotUpdate operation, - String description, - String newFlinkJobId, - String operatorId, - long checkpointId) { - - snapshotProperties.forEach(operation::set); - // custom snapshot metadata properties will be overridden if they conflict with internal ones - // used by the sink. - operation.set(SinkUtil.MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); - operation.set(SinkUtil.FLINK_JOB_ID, newFlinkJobId); - operation.set(SinkUtil.OPERATOR_ID, operatorId); - operation.toBranch(branch); - - long startNano = System.nanoTime(); - operation.commit(); // abort is automatically called if this fails. 
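Because commitOperation stamps the job id, operator id and checkpoint id into the snapshot summary, the committed watermark can be recovered from table history; a simplified sketch assuming `table`, `jobId` and `operatorId` are in scope (the production SinkUtil helper is more selective about which snapshots it inspects).

    long maxCommitted = -1L;
    for (Snapshot snapshot : table.snapshots()) {
      Map<String, String> summary = snapshot.summary();
      if (jobId.equals(summary.get("flink.job-id"))
          && operatorId.equals(summary.get("flink.operator-id"))) {
        String committed = summary.get("flink.max-committed-checkpoint-id");
        if (committed != null) {
          maxCommitted = Math.max(maxCommitted, Long.parseLong(committed));
        }
      }
    }
    // maxCommitted == -1 means this job/operator pair has never committed to the table.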
- long durationMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano); - LOG.info( - "Committed {} to table: {}, branch: {}, checkpointId {} in {} ms", - description, - table.name(), - branch, - checkpointId, - durationMs); - if (committerMetrics != null) { - committerMetrics.commitDuration(durationMs); - } - } - - @Override - public void close() throws IOException { - tableLoader.close(); - workerPool.shutdown(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java deleted file mode 100644 index b510dce28bac..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java +++ /dev/null @@ -1,483 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import java.util.SortedMap; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.typeinfo.BasicTypeInfo; -import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.runtime.typeutils.SortedMapTypeInfo; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.ReplacePartitions; -import org.apache.iceberg.RowDelta; -import org.apache.iceberg.SnapshotUpdate; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.base.Strings; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Comparators; -import 
org.apache.iceberg.types.Types; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -class IcebergFilesCommitter extends AbstractStreamOperator - implements OneInputStreamOperator, BoundedOneInput { - - private static final long serialVersionUID = 1L; - private static final long INITIAL_CHECKPOINT_ID = -1L; - private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; - - private static final Logger LOG = LoggerFactory.getLogger(IcebergFilesCommitter.class); - private static final String FLINK_JOB_ID = "flink.job-id"; - private static final String OPERATOR_ID = "flink.operator-id"; - - // The max checkpoint id we've committed to iceberg table. As the flink's checkpoint is always - // increasing, so we could correctly commit all the data files whose checkpoint id is greater than - // the max committed one to iceberg table, for avoiding committing the same data files twice. This - // id will be attached to iceberg's meta when committing the iceberg transaction. - private static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; - static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; - - // TableLoader to load iceberg table lazily. - private final TableLoader tableLoader; - private final boolean replacePartitions; - private final Map snapshotProperties; - - // A sorted map to maintain the completed data files for each pending checkpointId (which have not - // been committed to iceberg table). We need a sorted map here because there's possible that few - // checkpoints snapshot failed, for example: the 1st checkpoint have 2 data files <1, >, the 2st checkpoint have 1 data files <2, >. Snapshot for checkpoint#1 - // interrupted because of network/disk failure etc, while we don't expect any data loss in iceberg - // table. So we keep the finished files <1, > in memory and retry to commit iceberg - // table when the next checkpoint happen. - private final NavigableMap dataFilesPerCheckpoint = Maps.newTreeMap(); - - // The completed files cache for current checkpoint. Once the snapshot barrier received, it will - // be flushed to the 'dataFilesPerCheckpoint'. - private final Map> writeResultsSinceLastSnapshot = Maps.newHashMap(); - private final String branch; - - // It will have an unique identifier for one job. - private transient String flinkJobId; - private transient String operatorUniqueId; - private transient Table table; - private transient IcebergFilesCommitterMetrics committerMetrics; - private transient ManifestOutputFileFactory manifestOutputFileFactory; - private transient long maxCommittedCheckpointId; - private transient int continuousEmptyCheckpoints; - private transient int maxContinuousEmptyCommits; - // There're two cases that we restore from flink checkpoints: the first case is restoring from - // snapshot created by the same flink job; another case is restoring from snapshot created by - // another different job. For the second case, we need to maintain the old flink job's id in flink - // state backend to find the max-committed-checkpoint-id when traversing iceberg table's - // snapshots. - private static final ListStateDescriptor JOB_ID_DESCRIPTOR = - new ListStateDescriptor<>("iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO); - private transient ListState jobIdState; - // All pending checkpoints states for this function. 
- private static final ListStateDescriptor> STATE_DESCRIPTOR = - buildStateDescriptor(); - private transient ListState> checkpointsState; - - private final Integer workerPoolSize; - private final PartitionSpec spec; - private transient ExecutorService workerPool; - - IcebergFilesCommitter( - TableLoader tableLoader, - boolean replacePartitions, - Map snapshotProperties, - Integer workerPoolSize, - String branch, - PartitionSpec spec) { - this.tableLoader = tableLoader; - this.replacePartitions = replacePartitions; - this.snapshotProperties = snapshotProperties; - this.workerPoolSize = workerPoolSize; - this.branch = branch; - this.spec = spec; - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - super.initializeState(context); - this.flinkJobId = getContainingTask().getEnvironment().getJobID().toString(); - this.operatorUniqueId = getRuntimeContext().getOperatorUniqueID(); - - // Open the table loader and load the table. - this.tableLoader.open(); - this.table = tableLoader.loadTable(); - this.committerMetrics = new IcebergFilesCommitterMetrics(super.metrics, table.name()); - - maxContinuousEmptyCommits = - PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); - Preconditions.checkArgument( - maxContinuousEmptyCommits > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); - - int subTaskId = getRuntimeContext().getIndexOfThisSubtask(); - int attemptId = getRuntimeContext().getAttemptNumber(); - this.manifestOutputFileFactory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorUniqueId, subTaskId, attemptId); - this.maxCommittedCheckpointId = INITIAL_CHECKPOINT_ID; - - this.checkpointsState = context.getOperatorStateStore().getListState(STATE_DESCRIPTOR); - this.jobIdState = context.getOperatorStateStore().getListState(JOB_ID_DESCRIPTOR); - if (context.isRestored()) { - Iterable jobIdIterable = jobIdState.get(); - if (jobIdIterable == null || !jobIdIterable.iterator().hasNext()) { - LOG.warn( - "Failed to restore committer state. This can happen when operator uid changed and Flink " - + "allowNonRestoredState is enabled. Best practice is to explicitly set the operator id " - + "via FlinkSink#Builder#uidPrefix() so that the committer operator uid is stable. " - + "Otherwise, Flink auto generate an operator uid based on job topology." - + "With that, operator uid is subjective to change upon topology change."); - return; - } - - String restoredFlinkJobId = jobIdIterable.iterator().next(); - Preconditions.checkState( - !Strings.isNullOrEmpty(restoredFlinkJobId), - "Flink job id parsed from checkpoint snapshot shouldn't be null or empty"); - - // Since flink's checkpoint id will start from the max-committed-checkpoint-id + 1 in the new - // flink job even if it's restored from a snapshot created by another different flink job, so - // it's safe to assign the max committed checkpoint id from restored flink job to the current - // flink job. - this.maxCommittedCheckpointId = - SinkUtil.getMaxCommittedCheckpointId(table, restoredFlinkJobId, operatorUniqueId, branch); - - NavigableMap uncommittedDataFiles = - Maps.newTreeMap(checkpointsState.get().iterator().next()) - .tailMap(maxCommittedCheckpointId, false); - if (!uncommittedDataFiles.isEmpty()) { - // Committed all uncommitted data files from the old flink job to iceberg table. 
- long maxUncommittedCheckpointId = uncommittedDataFiles.lastKey(); - commitUpToCheckpoint( - uncommittedDataFiles, restoredFlinkJobId, operatorUniqueId, maxUncommittedCheckpointId); - } - } - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - super.snapshotState(context); - long checkpointId = context.getCheckpointId(); - LOG.info( - "Start to flush snapshot state to state backend, table: {}, checkpointId: {}", - table, - checkpointId); - - // Update the checkpoint state. - long startNano = System.nanoTime(); - writeToManifestUptoLatestCheckpoint(checkpointId); - - // Reset the snapshot state to the latest state. - checkpointsState.clear(); - checkpointsState.add(dataFilesPerCheckpoint); - - jobIdState.clear(); - jobIdState.add(flinkJobId); - - committerMetrics.checkpointDuration( - TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); - } - - @Override - public void notifyCheckpointComplete(long checkpointId) throws Exception { - super.notifyCheckpointComplete(checkpointId); - // It's possible that we have the following events: - // 1. snapshotState(ckpId); - // 2. snapshotState(ckpId+1); - // 3. notifyCheckpointComplete(ckpId+1); - // 4. notifyCheckpointComplete(ckpId); - // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all - // the files, - // Besides, we need to maintain the max-committed-checkpoint-id to be increasing. - if (checkpointId > maxCommittedCheckpointId) { - LOG.info("Checkpoint {} completed. Attempting commit.", checkpointId); - commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, checkpointId); - this.maxCommittedCheckpointId = checkpointId; - } else { - LOG.info( - "Skipping committing checkpoint {}. {} is already committed.", - checkpointId, - maxCommittedCheckpointId); - } - - // reload the table in case new configuration is needed - this.table = tableLoader.loadTable(); - } - - private void commitUpToCheckpoint( - NavigableMap deltaManifestsMap, - String newFlinkJobId, - String operatorId, - long checkpointId) - throws IOException { - NavigableMap pendingMap = deltaManifestsMap.headMap(checkpointId, true); - List manifests = Lists.newArrayList(); - NavigableMap pendingResults = Maps.newTreeMap(); - for (Map.Entry e : pendingMap.entrySet()) { - if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue())) { - // Skip the empty flink manifest. - continue; - } - - DeltaManifests deltaManifests = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, e.getValue()); - pendingResults.put( - e.getKey(), - FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs())); - manifests.addAll(deltaManifests.manifests()); - } - - CommitSummary summary = new CommitSummary(pendingResults); - commitPendingResult(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); - committerMetrics.updateCommitSummary(summary); - pendingMap.clear(); - FlinkManifestUtil.deleteCommittedManifests(table, manifests, newFlinkJobId, checkpointId); - } - - private void commitPendingResult( - NavigableMap pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId, - long checkpointId) { - long totalFiles = summary.dataFilesCount() + summary.deleteFilesCount(); - continuousEmptyCheckpoints = totalFiles == 0 ? 
continuousEmptyCheckpoints + 1 : 0; - if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { - if (replacePartitions) { - replacePartitions(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); - } else { - commitDeltaTxn(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); - } - continuousEmptyCheckpoints = 0; - } else { - LOG.info("Skip commit for checkpoint {} due to no data files or delete files.", checkpointId); - } - } - - private void replacePartitions( - NavigableMap pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId, - long checkpointId) { - Preconditions.checkState( - summary.deleteFilesCount() == 0, "Cannot overwrite partitions with delete files."); - // Commit the overwrite transaction. - ReplacePartitions dynamicOverwrite = table.newReplacePartitions().scanManifestsWith(workerPool); - for (WriteResult result : pendingResults.values()) { - Preconditions.checkState( - result.referencedDataFiles().length == 0, "Should have no referenced data files."); - Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); - } - - commitOperation( - dynamicOverwrite, - summary, - "dynamic partition overwrite", - newFlinkJobId, - operatorId, - checkpointId); - } - - private void commitDeltaTxn( - NavigableMap pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId, - long checkpointId) { - if (summary.deleteFilesCount() == 0) { - // To be compatible with iceberg format V1. - AppendFiles appendFiles = table.newAppend().scanManifestsWith(workerPool); - for (WriteResult result : pendingResults.values()) { - Preconditions.checkState( - result.referencedDataFiles().length == 0, - "Should have no referenced data files for append."); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - } - commitOperation(appendFiles, summary, "append", newFlinkJobId, operatorId, checkpointId); - } else { - // To be compatible with iceberg format V2. - for (Map.Entry e : pendingResults.entrySet()) { - // We don't commit the merged result into a single transaction because for the sequential - // transaction txn1 and txn2, the equality-delete files of txn2 are required to be applied - // to data files from txn1. Committing the merged one will lead to the incorrect delete - // semantic. - WriteResult result = e.getValue(); - - // Row delta validations are not needed for streaming changes that write equality deletes. - // Equality deletes are applied to data in all previous sequence numbers, so retries may - // push deletes further in the future, but do not affect correctness. Position deletes - // committed to the table in this path are used only to delete rows from data files that are - // being added in this commit. There is no way for data files added along with the delete - // files to be concurrently removed, so there is no need to validate the files referenced by - // the position delete files that are being committed. 
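A note on the empty-checkpoint handling in commitPendingResult above: an idle job still produces a commit every flink.max-continuous-empty-commits checkpoints (default 10, read from the table properties in initializeState). If those periodic empty snapshots are unwanted, the interval can be widened through the table property the committer consults. A minimal sketch, assuming a loaded Table handle; the class and method names are illustrative:

    import org.apache.iceberg.Table;

    class EmptyCommitTuning {
      // Widen the empty-commit interval from the default of 10 to, say, 50 checkpoints.
      static void relaxEmptyCommits(Table table) {
        table.updateProperties()
            .set("flink.max-continuous-empty-commits", "50")
            .commit();
      }
    }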
- RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool); - - Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); - Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - commitOperation(rowDelta, summary, "rowDelta", newFlinkJobId, operatorId, e.getKey()); - } - } - } - - private void commitOperation( - SnapshotUpdate operation, - CommitSummary summary, - String description, - String newFlinkJobId, - String operatorId, - long checkpointId) { - LOG.info( - "Committing {} for checkpoint {} to table {} branch {} with summary: {}", - description, - checkpointId, - table.name(), - branch, - summary); - snapshotProperties.forEach(operation::set); - // custom snapshot metadata properties will be overridden if they conflict with internal ones - // used by the sink. - operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); - operation.set(FLINK_JOB_ID, newFlinkJobId); - operation.set(OPERATOR_ID, operatorId); - operation.toBranch(branch); - - long startNano = System.nanoTime(); - operation.commit(); // abort is automatically called if this fails. - long durationMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano); - LOG.info( - "Committed {} to table: {}, branch: {}, checkpointId {} in {} ms", - description, - table.name(), - branch, - checkpointId, - durationMs); - committerMetrics.commitDuration(durationMs); - } - - @Override - public void processElement(StreamRecord element) { - FlinkWriteResult flinkWriteResult = element.getValue(); - List writeResults = - writeResultsSinceLastSnapshot.computeIfAbsent( - flinkWriteResult.checkpointId(), k -> Lists.newArrayList()); - writeResults.add(flinkWriteResult.writeResult()); - } - - @Override - public void endInput() throws IOException { - // Flush the buffered data files into 'dataFilesPerCheckpoint' firstly. - long currentCheckpointId = IcebergStreamWriter.END_INPUT_CHECKPOINT_ID; - writeToManifestUptoLatestCheckpoint(currentCheckpointId); - commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, currentCheckpointId); - } - - private void writeToManifestUptoLatestCheckpoint(long checkpointId) throws IOException { - if (!writeResultsSinceLastSnapshot.containsKey(checkpointId)) { - dataFilesPerCheckpoint.put(checkpointId, EMPTY_MANIFEST_DATA); - } - - for (Map.Entry> writeResultsOfCheckpoint : - writeResultsSinceLastSnapshot.entrySet()) { - dataFilesPerCheckpoint.put( - writeResultsOfCheckpoint.getKey(), - writeToManifest(writeResultsOfCheckpoint.getKey(), writeResultsOfCheckpoint.getValue())); - } - - // Clear the local buffer for current checkpoint. - writeResultsSinceLastSnapshot.clear(); - } - - /** - * Write all the complete data files to a newly created manifest file and return the manifest's - * avro serialized bytes. 
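The avro-serialized bytes described above are exactly what is stored per checkpoint in the operator state, and the commit path decodes them again with the same serializer. A small round-trip sketch, assuming it lives in the org.apache.iceberg.flink.sink package (DeltaManifests and its serializer are package-private); the class name is illustrative:

    package org.apache.iceberg.flink.sink;

    import java.io.IOException;
    import org.apache.flink.core.io.SimpleVersionedSerialization;

    class ManifestStateRoundTrip {
      // Sketch of the state round trip: DeltaManifests are written as version-tagged
      // bytes (what writeToManifest returns) and decoded again before the commit.
      static DeltaManifests roundTrip(DeltaManifests deltaManifests) throws IOException {
        byte[] bytes =
            SimpleVersionedSerialization.writeVersionAndSerialize(
                DeltaManifestsSerializer.INSTANCE, deltaManifests);
        return SimpleVersionedSerialization.readVersionAndDeSerialize(
            DeltaManifestsSerializer.INSTANCE, bytes);
      }
    }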
- */ - private byte[] writeToManifest(long checkpointId, List writeResults) - throws IOException { - WriteResult result = WriteResult.builder().addAll(writeResults).build(); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - result, () -> manifestOutputFileFactory.create(checkpointId), spec); - - return SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, deltaManifests); - } - - @Override - public void open() throws Exception { - super.open(); - - final String operatorID = getRuntimeContext().getOperatorUniqueID(); - this.workerPool = - ThreadPools.newFixedThreadPool("iceberg-worker-pool-" + operatorID, workerPoolSize); - } - - @Override - public void close() throws Exception { - if (tableLoader != null) { - tableLoader.close(); - } - - if (workerPool != null) { - workerPool.shutdown(); - } - } - - @VisibleForTesting - static ListStateDescriptor> buildStateDescriptor() { - Comparator longComparator = Comparators.forType(Types.LongType.get()); - // Construct a SortedMapTypeInfo. - SortedMapTypeInfo sortedMapTypeInfo = - new SortedMapTypeInfo<>( - BasicTypeInfo.LONG_TYPE_INFO, - PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO, - longComparator); - return new ListStateDescriptor<>("iceberg-files-committer-state", sortedMapTypeInfo); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java deleted file mode 100644 index ce81ef11f13c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.annotation.Internal; -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.MetricGroup; -import org.apache.iceberg.flink.util.ElapsedTimeGauge; - -@Internal -public class IcebergFilesCommitterMetrics { - private final AtomicLong lastCheckpointDurationMs = new AtomicLong(); - private final AtomicLong lastCommitDurationMs = new AtomicLong(); - private final ElapsedTimeGauge elapsedSecondsSinceLastSuccessfulCommit; - private final Counter committedDataFilesCount; - private final Counter committedDataFilesRecordCount; - private final Counter committedDataFilesByteCount; - private final Counter committedDeleteFilesCount; - private final Counter committedDeleteFilesRecordCount; - private final Counter committedDeleteFilesByteCount; - - public IcebergFilesCommitterMetrics(MetricGroup metrics, String fullTableName) { - MetricGroup committerMetrics = - metrics.addGroup("IcebergFilesCommitter").addGroup("table", fullTableName); - committerMetrics.gauge("lastCheckpointDurationMs", lastCheckpointDurationMs::get); - committerMetrics.gauge("lastCommitDurationMs", lastCommitDurationMs::get); - this.elapsedSecondsSinceLastSuccessfulCommit = new ElapsedTimeGauge(TimeUnit.SECONDS); - committerMetrics.gauge( - "elapsedSecondsSinceLastSuccessfulCommit", elapsedSecondsSinceLastSuccessfulCommit); - this.committedDataFilesCount = committerMetrics.counter("committedDataFilesCount"); - this.committedDataFilesRecordCount = committerMetrics.counter("committedDataFilesRecordCount"); - this.committedDataFilesByteCount = committerMetrics.counter("committedDataFilesByteCount"); - this.committedDeleteFilesCount = committerMetrics.counter("committedDeleteFilesCount"); - this.committedDeleteFilesRecordCount = - committerMetrics.counter("committedDeleteFilesRecordCount"); - this.committedDeleteFilesByteCount = committerMetrics.counter("committedDeleteFilesByteCount"); - } - - public void checkpointDuration(long checkpointDurationMs) { - lastCheckpointDurationMs.set(checkpointDurationMs); - } - - public void commitDuration(long commitDurationMs) { - lastCommitDurationMs.set(commitDurationMs); - } - - /** This is called upon a successful commit. */ - public void updateCommitSummary(CommitSummary stats) { - elapsedSecondsSinceLastSuccessfulCommit.refreshLastRecordedTime(); - committedDataFilesCount.inc(stats.dataFilesCount()); - committedDataFilesRecordCount.inc(stats.dataFilesRecordCount()); - committedDataFilesByteCount.inc(stats.dataFilesByteCount()); - committedDeleteFilesCount.inc(stats.deleteFilesCount()); - committedDeleteFilesRecordCount.inc(stats.deleteFilesRecordCount()); - committedDeleteFilesByteCount.inc(stats.deleteFilesByteCount()); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java deleted file mode 100644 index 0ea0232278bd..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java +++ /dev/null @@ -1,963 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.time.Duration; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.UUID; -import java.util.function.Function; -import org.apache.flink.annotation.Experimental; -import org.apache.flink.api.common.SupportsConcurrentExecutionAttempts; -import org.apache.flink.api.common.functions.FlatMapFunction; -import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.sink2.Committer; -import org.apache.flink.api.connector.sink2.CommitterInitContext; -import org.apache.flink.api.connector.sink2.Sink; -import org.apache.flink.api.connector.sink2.SinkWriter; -import org.apache.flink.api.connector.sink2.SupportsCommitter; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessageTypeInfo; -import org.apache.flink.streaming.api.connector.sink2.SupportsPostCommitTopology; -import org.apache.flink.streaming.api.connector.sink2.SupportsPreCommitTopology; -import org.apache.flink.streaming.api.connector.sink2.SupportsPreWriteTopology; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSink; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Partitioning; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.api.FlinkMaintenanceConfig; -import org.apache.iceberg.flink.maintenance.api.LockConfig; -import org.apache.iceberg.flink.maintenance.api.RewriteDataFiles; -import 
org.apache.iceberg.flink.maintenance.api.RewriteDataFilesConfig; -import org.apache.iceberg.flink.maintenance.api.TableMaintenance; -import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; -import org.apache.iceberg.flink.maintenance.operator.LockFactoryBuilder; -import org.apache.iceberg.flink.maintenance.operator.TableChange; -import org.apache.iceberg.flink.sink.shuffle.DataStatisticsOperatorFactory; -import org.apache.iceberg.flink.sink.shuffle.RangePartitioner; -import org.apache.iceberg.flink.sink.shuffle.StatisticsOrRecord; -import org.apache.iceberg.flink.sink.shuffle.StatisticsOrRecordTypeInformation; -import org.apache.iceberg.flink.sink.shuffle.StatisticsType; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.SerializableSupplier; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Flink v2 sink offer different hooks to insert custom topologies into the sink. We will use the - * following: - * - *
- * <ul>
- *   <li>{@link SupportsPreWriteTopology} which redistributes the data to the writers based on the
- *       {@link DistributionMode}
- *   <li>{@link org.apache.flink.api.connector.sink2.SinkWriter} which writes data/delete files, and
- *       generates the {@link org.apache.iceberg.io.WriteResult} objects for the files
- *   <li>{@link SupportsPreCommitTopology} which we use to place the {@link
- *       org.apache.iceberg.flink.sink.IcebergWriteAggregator} which merges the individual {@link
- *       org.apache.flink.api.connector.sink2.SinkWriter}'s {@link
- *       org.apache.iceberg.io.WriteResult}s to a single {@link
- *       org.apache.iceberg.flink.sink.IcebergCommittable}
- *   <li>{@link org.apache.iceberg.flink.sink.IcebergCommitter} which commits the incoming {@link
- *       org.apache.iceberg.flink.sink.IcebergCommittable}s to the Iceberg table
- *   <li>{@link SupportsPostCommitTopology} we could use for incremental compaction later. This is
- *       not implemented yet.
- * </ul>
- *
- * The job graph looks like below:
- *
- * <pre>{@code
    - *                            Flink sink
    - *               +-----------------------------------------------------------------------------------+
    - *               |                                                                                   |
    - * +-------+     | +----------+                               +-------------+      +---------------+ |
    - * | Map 1 | ==> | | writer 1 |                               | committer 1 | ---> | post commit 1 | |
    - * +-------+     | +----------+                               +-------------+      +---------------+ |
    - *               |             \                             /                \                      |
    - *               |              \                           /                  \                     |
    - *               |               \                         /                    \                    |
    - * +-------+     | +----------+   \ +-------------------+ /   +-------------+    \ +---------------+ |
    - * | Map 2 | ==> | | writer 2 | --->| commit aggregator |     | committer 2 |      | post commit 2 | |
    - * +-------+     | +----------+     +-------------------+     +-------------+      +---------------+ |
    - *               |                                             Commit only on                        |
    - *               |                                             committer 1                           |
    - *               +-----------------------------------------------------------------------------------+
- * }</pre>
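As an illustrative aside to the topology above, this is roughly how the builder defined further down in this class is used for an upsert pipeline. The table location, column name, and uid suffix are placeholders, and the static forRowData entry point is assumed (only the Builder-level forRowData appears in this hunk):

    import java.util.Collections;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.table.data.RowData;
    import org.apache.iceberg.flink.TableLoader;
    import org.apache.iceberg.flink.sink.IcebergSink;

    class IcebergSinkUsage {
      // Write a RowData stream to an Iceberg table in upsert mode, keyed on "id".
      static void appendSink(DataStream<RowData> rowDataStream) {
        TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");
        IcebergSink.forRowData(rowDataStream)
            .tableLoader(tableLoader)
            .upsert(true)
            .equalityFieldColumns(Collections.singletonList("id"))
            .uidSuffix("orders-sink")
            .append();
      }
    }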
    - */ -@Experimental -public class IcebergSink - implements Sink, - SupportsPreWriteTopology, - SupportsCommitter, - SupportsPreCommitTopology, - SupportsPostCommitTopology, - SupportsConcurrentExecutionAttempts { - private static final Logger LOG = LoggerFactory.getLogger(IcebergSink.class); - private final TableLoader tableLoader; - private final Map snapshotProperties; - private final String uidSuffix; - private final String sinkId; - private final Map writeProperties; - private final RowType flinkRowType; - private final SerializableSupplier
    tableSupplier; - private final transient FlinkWriteConf flinkWriteConf; - private final Set equalityFieldIds; - private final boolean upsertMode; - private final FileFormat dataFileFormat; - private final long targetDataFileSize; - private final String branch; - private final boolean overwriteMode; - private final int workerPoolSize; - private final boolean compactMode; - private final transient FlinkMaintenanceConfig flinkMaintenanceConfig; - - private final Table table; - private final Set equalityFieldColumns = null; - - private IcebergSink( - TableLoader tableLoader, - Table table, - Map snapshotProperties, - String uidSuffix, - Map writeProperties, - RowType flinkRowType, - SerializableSupplier
    tableSupplier, - FlinkWriteConf flinkWriteConf, - Set equalityFieldIds, - String branch, - boolean overwriteMode, - FlinkMaintenanceConfig flinkMaintenanceConfig) { - this.tableLoader = tableLoader; - this.snapshotProperties = snapshotProperties; - this.uidSuffix = uidSuffix; - this.writeProperties = writeProperties; - this.flinkRowType = flinkRowType; - this.tableSupplier = tableSupplier; - this.flinkWriteConf = flinkWriteConf; - this.equalityFieldIds = equalityFieldIds; - this.branch = branch; - this.overwriteMode = overwriteMode; - this.table = table; - this.upsertMode = flinkWriteConf.upsertMode(); - this.dataFileFormat = flinkWriteConf.dataFileFormat(); - this.targetDataFileSize = flinkWriteConf.targetDataFileSize(); - this.workerPoolSize = flinkWriteConf.workerPoolSize(); - // We generate a random UUID every time when a sink is created. - // This is used to separate files generated by different sinks writing the same table. - // Also used to generate the aggregator operator name - this.sinkId = UUID.randomUUID().toString(); - this.compactMode = flinkWriteConf.compactMode(); - this.flinkMaintenanceConfig = flinkMaintenanceConfig; - } - - @Override - public SinkWriter createWriter(InitContext context) { - RowDataTaskWriterFactory taskWriterFactory = - new RowDataTaskWriterFactory( - tableSupplier, - flinkRowType, - targetDataFileSize, - dataFileFormat, - writeProperties, - equalityFieldIds, - upsertMode); - IcebergStreamWriterMetrics metrics = - new IcebergStreamWriterMetrics(context.metricGroup(), table.name()); - return new IcebergSinkWriter( - tableSupplier.get().name(), - taskWriterFactory, - metrics, - context.getSubtaskId(), - context.getAttemptNumber()); - } - - @Override - public Committer createCommitter(CommitterInitContext context) { - IcebergFilesCommitterMetrics metrics = - new IcebergFilesCommitterMetrics(context.metricGroup(), table.name()); - return new IcebergCommitter( - tableLoader, - branch, - snapshotProperties, - overwriteMode, - workerPoolSize, - sinkId, - metrics, - compactMode); - } - - @Override - public SimpleVersionedSerializer getCommittableSerializer() { - return new IcebergCommittableSerializer(); - } - - @Override - public void addPostCommitTopology( - DataStream> committables) { - - if (!compactMode) { - return; - } - - String suffix = defaultSuffix(uidSuffix, table.name()); - String postCommitUid = String.format("Sink post-commit : %s", suffix); - - SingleOutputStreamOperator tableChangeStream = - committables - .global() - .process(new CommittableToTableChangeConverter(table.io(), table.name(), table.specs())) - .uid(postCommitUid) - .forceNonParallel(); - try { - RewriteDataFilesConfig rewriteDataFilesConfig = - flinkMaintenanceConfig.createRewriteDataFilesConfig(); - RewriteDataFiles.Builder rewriteBuilder = - RewriteDataFiles.builder().config(rewriteDataFilesConfig); - - LockConfig lockConfig = flinkMaintenanceConfig.createLockConfig(); - TriggerLockFactory triggerLockFactory = LockFactoryBuilder.build(lockConfig, table.name()); - String tableMaintenanceUid = String.format("TableMaintenance : %s", suffix); - TableMaintenance.Builder builder = - TableMaintenance.forChangeStream(tableChangeStream, tableLoader, triggerLockFactory) - .uidSuffix(tableMaintenanceUid) - .add(rewriteBuilder); - - builder - .rateLimit(Duration.ofSeconds(flinkMaintenanceConfig.rateLimit())) - .lockCheckDelay(Duration.ofSeconds(flinkMaintenanceConfig.lockCheckDelay())) - .slotSharingGroup(flinkMaintenanceConfig.slotSharingGroup()) - 
.parallelism(flinkMaintenanceConfig.parallelism()) - .append(); - } catch (IOException e) { - throw new UncheckedIOException("Failed to create tableMaintenance ", e); - } - } - - @Override - public DataStream addPreWriteTopology(DataStream inputDataStream) { - return distributeDataStream(inputDataStream); - } - - @Override - public DataStream> addPreCommitTopology( - DataStream> writeResults) { - TypeInformation> typeInformation = - CommittableMessageTypeInfo.of(this::getCommittableSerializer); - - String suffix = defaultSuffix(uidSuffix, table.name()); - String preCommitAggregatorUid = String.format("Sink pre-commit aggregator: %s", suffix); - - // global forces all output records send to subtask 0 of the downstream committer operator. - // This is to ensure commit only happen in one committer subtask. - // Once upstream Flink provides the capability of setting committer operator - // parallelism to 1, this can be removed. - return writeResults - .global() - .transform(preCommitAggregatorUid, typeInformation, new IcebergWriteAggregator(tableLoader)) - .uid(preCommitAggregatorUid) - .setParallelism(1) - .setMaxParallelism(1) - // global forces all output records send to subtask 0 of the downstream committer operator. - // This is to ensure commit only happen in one committer subtask. - // Once upstream Flink provides the capability of setting committer operator - // parallelism to 1, this can be removed. - .global(); - } - - @Override - public SimpleVersionedSerializer getWriteResultSerializer() { - return new WriteResultSerializer(); - } - - public static class Builder { - private TableLoader tableLoader; - private String uidSuffix = ""; - private Function> inputCreator = null; - @Deprecated private TableSchema tableSchema; - private ResolvedSchema resolvedSchema; - private SerializableTable table; - private final Map writeOptions = Maps.newHashMap(); - private final Map snapshotSummary = Maps.newHashMap(); - private ReadableConfig readableConfig = new Configuration(); - private List equalityFieldColumns = null; - - private Builder() {} - - private Builder forRowData(DataStream newRowDataInput) { - this.inputCreator = ignored -> newRowDataInput; - return this; - } - - /** - * Clean up after removing {@link IcebergSink#forRow(DataStream, TableSchema)} - * - * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #forRow(DataStream, - * ResolvedSchema)} instead. 
- */ - @Deprecated - private Builder forRow(DataStream input, TableSchema inputTableSchema) { - RowType rowType = (RowType) inputTableSchema.toRowDataType().getLogicalType(); - DataType[] fieldDataTypes = inputTableSchema.getFieldDataTypes(); - - DataFormatConverters.RowConverter rowConverter = - new DataFormatConverters.RowConverter(fieldDataTypes); - return forMapperOutputType( - input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) - .tableSchema(inputTableSchema); - } - - private Builder forRow(DataStream input, ResolvedSchema inputResolvedSchema) { - RowType rowType = (RowType) inputResolvedSchema.toSinkRowDataType().getLogicalType(); - DataType[] fieldDataTypes = inputResolvedSchema.getColumnDataTypes().toArray(DataType[]::new); - - DataFormatConverters.RowConverter rowConverter = - new DataFormatConverters.RowConverter(fieldDataTypes); - return forMapperOutputType( - input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) - .resolvedSchema(inputResolvedSchema); - } - - private Builder forMapperOutputType( - DataStream input, MapFunction mapper, TypeInformation outputType) { - this.inputCreator = - newUidSuffix -> { - // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we - // need to set the parallelism of map operator same as its input to keep map operator - // chaining its input, and avoid rebalanced by default. - SingleOutputStreamOperator inputStream = - input.map(mapper, outputType).setParallelism(input.getParallelism()); - if (newUidSuffix != null) { - String uid = String.format("Sink pre-writer mapper: %s", newUidSuffix); - inputStream.name(uid).uid(uid); - } - return inputStream; - }; - return this; - } - - /** - * This iceberg {@link SerializableTable} instance is used for initializing {@link - * IcebergStreamWriter} which will write all the records into {@link DataFile}s and emit them to - * downstream operator. Providing a table would avoid so many table loading from each separate - * task. - * - * @param newTable the loaded iceberg table instance. - * @return {@link IcebergSink.Builder} to connect the iceberg table. - */ - public Builder table(Table newTable) { - this.table = (SerializableTable) SerializableTable.copyOf(newTable); - return this; - } - - /** - * The table loader is used for loading tables in {@link - * org.apache.iceberg.flink.sink.IcebergCommitter} lazily, we need this loader because {@link - * Table} is not serializable and could not just use the loaded table from Builder#table in the - * remote task manager. - * - * @param newTableLoader to load iceberg table inside tasks. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder tableLoader(TableLoader newTableLoader) { - this.tableLoader = newTableLoader; - return this; - } - - TableLoader tableLoader() { - return tableLoader; - } - - /** - * Set the write properties for IcebergSink. View the supported properties in {@link - * FlinkWriteOptions} - */ - public Builder set(String property, String value) { - writeOptions.put(property, value); - return this; - } - - /** - * Set the write properties for IcebergSink. View the supported properties in {@link - * FlinkWriteOptions} - */ - public Builder setAll(Map properties) { - writeOptions.putAll(properties); - return this; - } - - /** - * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link - * #resolvedSchema(ResolvedSchema)} instead. 
- */ - @Deprecated - public Builder tableSchema(TableSchema newTableSchema) { - this.tableSchema = newTableSchema; - return this; - } - - public Builder resolvedSchema(ResolvedSchema newResolvedSchema) { - this.resolvedSchema = newResolvedSchema; - return this; - } - - public Builder overwrite(boolean newOverwrite) { - writeOptions.put(FlinkWriteOptions.OVERWRITE_MODE.key(), Boolean.toString(newOverwrite)); - return this; - } - - public Builder flinkConf(ReadableConfig config) { - this.readableConfig = config; - return this; - } - - /** - * Configure the write {@link DistributionMode} that the IcebergSink will use. Currently, flink - * support {@link DistributionMode#NONE} and {@link DistributionMode#HASH} and {@link - * DistributionMode#RANGE} - * - * @param mode to specify the write distribution mode. - * @return {@link IcebergSink.Builder} to connect the iceberg table. - */ - public Builder distributionMode(DistributionMode mode) { - if (mode != null) { - writeOptions.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), mode.modeName()); - } - return this; - } - - /** - * Range distribution needs to collect statistics about data distribution to properly shuffle - * the records in relatively balanced way. In general, low cardinality should use {@link - * StatisticsType#Map} and high cardinality should use {@link StatisticsType#Sketch} Refer to - * {@link StatisticsType} Javadoc for more details. - * - *
    Default is {@link StatisticsType#Auto} where initially Map statistics is used. But if - * cardinality is higher than the threshold (currently 10K) as defined in {@code - * SketchUtil#OPERATOR_SKETCH_SWITCH_THRESHOLD}, statistics collection automatically switches to - * the sketch reservoir sampling. - * - *
    Explicit set the statistics type if the default behavior doesn't work. - * - * @param type to specify the statistics type for range distribution. - * @return {@link IcebergSink.Builder} to connect the iceberg table. - */ - public IcebergSink.Builder rangeDistributionStatisticsType(StatisticsType type) { - if (type != null) { - writeOptions.put(FlinkWriteOptions.RANGE_DISTRIBUTION_STATISTICS_TYPE.key(), type.name()); - } - return this; - } - - /** - * If sort order contains partition columns, each sort key would map to one partition and data - * file. This relative weight can avoid placing too many small files for sort keys with low - * traffic. It is a double value that defines the minimal weight for each sort key. `0.02` means - * each key has a base weight of `2%` of the targeted traffic weight per writer task. - * - *
    E.g. the sink Iceberg table is partitioned daily by event time. Assume the data stream - * contains events from now up to 180 days ago. With event time, traffic weight distribution - * across different days typically has a long tail pattern. Current day contains the most - * traffic. The older days (long tail) contain less and less traffic. Assume writer parallelism - * is `10`. The total weight across all 180 days is `10,000`. Target traffic weight per writer - * task would be `1,000`. Assume the weight sum for the oldest 150 days is `1,000`. Normally, - * the range partitioner would put all the oldest 150 days in one writer task. That writer task - * would write to 150 small files (one per day). Keeping 150 open files can potentially consume - * large amount of memory. Flushing and uploading 150 files (however small) at checkpoint time - * can also be potentially slow. If this config is set to `0.02`. It means every sort key has a - * base weight of `2%` of targeted weight of `1,000` for every write task. It would essentially - * avoid placing more than `50` data files (one per day) on one writer task no matter how small - * they are. - * - *
    This is only applicable to {@link StatisticsType#Map} for low-cardinality scenario. For - * {@link StatisticsType#Sketch} high-cardinality sort columns, they are usually not used as - * partition columns. Otherwise, too many partitions and small files may be generated during - * write. Sketch range partitioner simply splits high-cardinality keys into ordered ranges. - * - *
    Default is {@code 0.0%}. - */ - public Builder rangeDistributionSortKeyBaseWeight(double weight) { - writeOptions.put( - FlinkWriteOptions.RANGE_DISTRIBUTION_SORT_KEY_BASE_WEIGHT.key(), Double.toString(weight)); - return this; - } - - /** - * Configuring the write parallel number for iceberg stream writer. - * - * @param newWriteParallelism the number of parallel iceberg stream writer. - * @return {@link IcebergSink.Builder} to connect the iceberg table. - */ - public Builder writeParallelism(int newWriteParallelism) { - writeOptions.put( - FlinkWriteOptions.WRITE_PARALLELISM.key(), Integer.toString(newWriteParallelism)); - return this; - } - - /** - * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which - * means it will DELETE the old records and then INSERT the new records. In partitioned table, - * the partition fields should be a subset of equality fields, otherwise the old row that - * located in partition-A could not be deleted by the new row that located in partition-B. - * - * @param enabled indicate whether it should transform all INSERT/UPDATE_AFTER events to UPSERT. - * @return {@link IcebergSink.Builder} to connect the iceberg table. - */ - public Builder upsert(boolean enabled) { - writeOptions.put(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key(), Boolean.toString(enabled)); - return this; - } - - /** - * Configuring the equality field columns for iceberg table that accept CDC or UPSERT events. - * - * @param columns defines the iceberg table's key. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder equalityFieldColumns(List columns) { - this.equalityFieldColumns = columns; - return this; - } - - /** - * Set the uid suffix for IcebergSink operators. Note that IcebergSink internally consists of - * multiple operators (like writer, committer, aggregator). Actual operator uid will be appended - * with a suffix like "Sink Committer: $uidSuffix". - * - *
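To tie the range-distribution options above together, here is a hedged sketch of how a builder (obtained as in the earlier usage example) might be configured for a skewed, partitioned sort key; the concrete values are illustrative only:

    import org.apache.iceberg.DistributionMode;
    import org.apache.iceberg.flink.sink.IcebergSink;
    import org.apache.iceberg.flink.sink.shuffle.StatisticsType;

    class RangeDistributionTuning {
      // Illustrative configuration of the range-shuffle options documented above.
      static IcebergSink.Builder configure(IcebergSink.Builder builder) {
        return builder
            .distributionMode(DistributionMode.RANGE)
            // Auto starts with Map statistics and switches to Sketch for high cardinality.
            .rangeDistributionStatisticsType(StatisticsType.Auto)
            // Give every sort key at least 2% of a writer's target weight to avoid many tiny files.
            .rangeDistributionSortKeyBaseWeight(0.02d)
            .writeParallelism(10);
      }
    }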
    Flink auto generates operator uid if not set explicitly. It is a recommended - * best-practice to set uid for all operators before deploying to production. Flink has an - * option to {@code pipeline.auto-generate-uid=false} to disable auto-generation and force - * explicit setting of all operator uid. - * - *
    Be careful with setting this for an existing job, because now we are changing the operator - * uid from an auto-generated one to this new value. When deploying the change with a - * checkpoint, Flink won't be able to restore the previous IcebergSink operator state (more - * specifically the committer operator state). You need to use {@code --allowNonRestoredState} - * to ignore the previous sink state. During restore IcebergSink state is used to check if last - * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss - * if the Iceberg commit failed in the last completed checkpoint. - * - * @param newSuffix suffix for Flink sink operator uid and name - * @return {@link Builder} to connect the iceberg table. - */ - public Builder uidSuffix(String newSuffix) { - this.uidSuffix = newSuffix; - return this; - } - - public Builder snapshotProperties(Map properties) { - snapshotSummary.putAll(properties); - return this; - } - - public Builder setSnapshotProperty(String property, String value) { - snapshotSummary.put(property, value); - return this; - } - - public Builder toBranch(String branch) { - writeOptions.put(FlinkWriteOptions.BRANCH.key(), branch); - return this; - } - - IcebergSink build() { - - Preconditions.checkArgument( - inputCreator != null, - "Please use forRowData() or forMapperOutputType() to initialize the input DataStream."); - Preconditions.checkNotNull(tableLoader(), "Table loader shouldn't be null"); - - // Set the table if it is not yet set in the builder, so we can do the equalityId checks - SerializableTable serializableTable = checkAndGetTable(tableLoader(), table); - this.table = serializableTable; - // Init the `flinkWriteConf` here, so we can do the checks - FlinkWriteConf flinkWriteConf = new FlinkWriteConf(table, writeOptions, readableConfig); - - Duration tableRefreshInterval = flinkWriteConf.tableRefreshInterval(); - SerializableSupplier
<Table>
    tableSupplier; - if (tableRefreshInterval != null) { - tableSupplier = new CachingTableSupplier(table, tableLoader(), tableRefreshInterval); - } else { - tableSupplier = () -> serializableTable; - } - - boolean overwriteMode = flinkWriteConf.overwriteMode(); - - // Validate the equality fields and partition fields if we enable the upsert mode. - Set equalityFieldIds = - SinkUtil.checkAndGetEqualityFieldIds(table, equalityFieldColumns); - - if (flinkWriteConf.upsertMode()) { - Preconditions.checkState( - !overwriteMode, - "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); - Preconditions.checkState( - !equalityFieldIds.isEmpty(), - "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); - if (!table.spec().isUnpartitioned()) { - for (PartitionField partitionField : table.spec().fields()) { - Preconditions.checkState( - equalityFieldIds.contains(partitionField.sourceId()), - "In 'hash' distribution mode with equality fields set, source column '%s' of partition field '%s' " - + "should be included in equality fields: '%s'", - table.schema().findColumnName(partitionField.sourceId()), - partitionField, - equalityFieldColumns); - } - } - } - - FlinkMaintenanceConfig flinkMaintenanceConfig = - new FlinkMaintenanceConfig(table, writeOptions, readableConfig); - return new IcebergSink( - tableLoader, - table, - snapshotSummary, - uidSuffix, - SinkUtil.writeProperties(flinkWriteConf.dataFileFormat(), flinkWriteConf, table), - resolvedSchema != null - ? toFlinkRowType(table.schema(), resolvedSchema) - : toFlinkRowType(table.schema(), tableSchema), - tableSupplier, - flinkWriteConf, - equalityFieldIds, - flinkWriteConf.branch(), - overwriteMode, - flinkMaintenanceConfig); - } - - /** - * Append the iceberg sink operators to write records to iceberg table. - * - * @return {@link DataStreamSink} for sink. - */ - public DataStreamSink append() { - IcebergSink sink = build(); - String suffix = defaultSuffix(uidSuffix, table.name()); - DataStream rowDataInput = inputCreator.apply(suffix); - // Please note that V2 sink framework will apply the uid here to the framework created - // operators like writer, - // committer. E.g. "Sink writer: - DataStreamSink rowDataDataStreamSink = - rowDataInput.sinkTo(sink).uid(suffix).name(suffix); - - // Note that IcebergSink internally consists o multiple operators (like writer, committer, - // aggregator). - // The following parallelism will be propagated to all of the above operators. - rowDataDataStreamSink.setParallelism(sink.resolveWriterParallelism(rowDataInput)); - return rowDataDataStreamSink; - } - } - - private String operatorName(String suffix) { - return uidSuffix != null ? 
suffix + "-" + uidSuffix : suffix; - } - - private static String defaultSuffix(String uidSuffix, String defaultSuffix) { - if (uidSuffix == null || uidSuffix.isEmpty()) { - return defaultSuffix; - } - return uidSuffix; - } - - private static SerializableTable checkAndGetTable(TableLoader tableLoader, Table table) { - if (table == null) { - if (!tableLoader.isOpen()) { - tableLoader.open(); - } - - try (TableLoader loader = tableLoader) { - return (SerializableTable) SerializableTable.copyOf(loader.loadTable()); - } catch (IOException e) { - throw new UncheckedIOException( - "Failed to load iceberg table from table loader: " + tableLoader, e); - } - } - - return (SerializableTable) SerializableTable.copyOf(table); - } - - /** - * Clean up after removing {@link Builder#tableSchema} - * - * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #toFlinkRowType(Schema, - * ResolvedSchema)} instead. - */ - @Deprecated - private static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { - if (requestedSchema != null) { - // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing - // iceberg schema. - Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); - TypeUtil.validateWriteSchema(schema, writeSchema, true, true); - - // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will - // be promoted to iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT - // (backend by 1 'byte'), we will read 4 bytes rather than 1 byte, it will mess up the byte - // array in BinaryRowData. So here we must use flink schema. - return (RowType) requestedSchema.toRowDataType().getLogicalType(); - } else { - return FlinkSchemaUtil.convert(schema); - } - } - - private static RowType toFlinkRowType(Schema schema, ResolvedSchema requestedSchema) { - if (requestedSchema != null) { - // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing - // iceberg schema. - Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); - TypeUtil.validateWriteSchema(schema, writeSchema, true, true); - - // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will - // be promoted to iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT - // (backend by 1 'byte'), we will read 4 bytes rather than 1 byte, it will mess up the byte - // array in BinaryRowData. So here we must use flink schema. 
- return (RowType) requestedSchema.toSinkRowDataType().getLogicalType(); - } else { - return FlinkSchemaUtil.convert(schema); - } - } - - private DataStream distributeDataStream(DataStream input) { - DistributionMode mode = flinkWriteConf.distributionMode(); - Schema schema = table.schema(); - PartitionSpec spec = table.spec(); - SortOrder sortOrder = table.sortOrder(); - - LOG.info("Write distribution mode is '{}'", mode.modeName()); - switch (mode) { - case NONE: - return distributeDataStreamByNoneDistributionMode(input, schema); - case HASH: - return distributeDataStreamByHashDistributionMode(input, schema, spec); - case RANGE: - return distributeDataStreamByRangeDistributionMode(input, schema, spec, sortOrder); - default: - throw new RuntimeException("Unrecognized " + WRITE_DISTRIBUTION_MODE + ": " + mode); - } - } - - private DataStream distributeDataStreamByNoneDistributionMode( - DataStream input, Schema iSchema) { - if (equalityFieldIds.isEmpty()) { - return input; - } else { - LOG.info("Distribute rows by equality fields, because there are equality fields set"); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } - } - - private DataStream distributeDataStreamByHashDistributionMode( - DataStream input, Schema iSchema, PartitionSpec partitionSpec) { - if (equalityFieldIds.isEmpty()) { - if (partitionSpec.isUnpartitioned()) { - LOG.warn( - "Fallback to use 'none' distribution mode, because there are no equality fields set " - + "and table is unpartitioned"); - return input; - } else { - return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); - } - } else { - if (partitionSpec.isUnpartitioned()) { - LOG.info( - "Distribute rows by equality fields, because there are equality fields set " - + "and table is unpartitioned"); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } else { - for (PartitionField partitionField : partitionSpec.fields()) { - Preconditions.checkState( - equalityFieldIds.contains(partitionField.sourceId()), - "In 'hash' distribution mode with equality fields set, source column '%s' of partition field '%s' " - + "should be included in equality fields: '%s'", - table.schema().findColumnName(partitionField.sourceId()), - partitionField, - equalityFieldColumns); - } - return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); - } - } - } - - private int resolveWriterParallelism(DataStream input) { - // if the writeParallelism is not specified, we set the default to the input parallelism to - // encourage chaining. - return Optional.ofNullable(flinkWriteConf.writeParallelism()).orElseGet(input::getParallelism); - } - - private DataStream distributeDataStreamByRangeDistributionMode( - DataStream input, - Schema iSchema, - PartitionSpec partitionSpec, - SortOrder sortOrderParam) { - - int writerParallelism = resolveWriterParallelism(input); - - // needed because of checkStyle not allowing us to change the value of an argument - SortOrder sortOrder = sortOrderParam; - - // Ideally, exception should be thrown in the combination of range distribution and - // equality fields. Primary key case should use hash distribution mode. - // Keep the current behavior of falling back to keyBy for backward compatibility. - if (!equalityFieldIds.isEmpty()) { - LOG.warn( - "Hash distribute rows by equality fields, even though {}=range is set. 
" - + "Range distribution for primary keys are not always safe in " - + "Flink streaming writer.", - WRITE_DISTRIBUTION_MODE); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } - - // range distribute by partition key or sort key if table has an SortOrder - Preconditions.checkState( - sortOrder.isSorted() || partitionSpec.isPartitioned(), - "Invalid write distribution mode: range. Need to define sort order or partition spec."); - if (sortOrder.isUnsorted()) { - sortOrder = Partitioning.sortOrderFor(partitionSpec); - LOG.info("Construct sort order from partition spec"); - } - - LOG.info("Range distribute rows by sort order: {}", sortOrder); - StatisticsOrRecordTypeInformation statisticsOrRecordTypeInformation = - new StatisticsOrRecordTypeInformation(flinkRowType, iSchema, sortOrder); - StatisticsType statisticsType = flinkWriteConf.rangeDistributionStatisticsType(); - SingleOutputStreamOperator shuffleStream = - input - .transform( - operatorName("range-shuffle"), - statisticsOrRecordTypeInformation, - new DataStatisticsOperatorFactory( - iSchema, - sortOrder, - writerParallelism, - statisticsType, - flinkWriteConf.rangeDistributionSortKeyBaseWeight())) - // Set the parallelism same as input operator to encourage chaining - .setParallelism(input.getParallelism()); - - if (uidSuffix != null) { - shuffleStream = shuffleStream.uid("shuffle-" + uidSuffix); - } - - return shuffleStream - .partitionCustom(new RangePartitioner(iSchema, sortOrder), r -> r) - .flatMap( - (FlatMapFunction) - (statisticsOrRecord, out) -> { - if (statisticsOrRecord.hasRecord()) { - out.collect(statisticsOrRecord.record()); - } - }) - // Set slot sharing group and the parallelism same as writerParallelism to - // promote operator chaining with the downstream writer operator - .slotSharingGroup("shuffle-partition-custom-group") - .setParallelism(writerParallelism) - .returns(RowData.class); - } - - /** - * Initialize a {@link Builder} to export the data from generic input data stream into iceberg - * table. We use {@link RowData} inside the sink connector, so users need to provide a mapper - * function and a {@link TypeInformation} to convert those generic records to a RowData - * DataStream. - * - * @param input the generic source input data stream. - * @param mapper function to convert the generic data to {@link RowData} - * @param outputType to define the {@link TypeInformation} for the input data. - * @param the data type of records. - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder builderFor( - DataStream input, MapFunction mapper, TypeInformation outputType) { - return new Builder().forMapperOutputType(input, mapper, outputType); - } - - /** - * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into - * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a - * {@link TableSchema} for builder to convert those {@link Row}s to a {@link RowData} DataStream. - * - * @param input the source input data stream with {@link Row}s. - * @param tableSchema defines the {@link TypeInformation} for input data. - * @return {@link Builder} to connect the iceberg table. - * @deprecated Use {@link #forRow(DataStream, ResolvedSchema)} instead. 
- */ - @Deprecated - public static Builder forRow(DataStream input, TableSchema tableSchema) { - return new Builder().forRow(input, tableSchema); - } - - /** - * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into - * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a - * {@link ResolvedSchema} for builder to convert those {@link Row}s to a {@link RowData} - * DataStream. - * - * @param input the source input data stream with {@link Row}s. - * @param resolvedSchema defines the {@link TypeInformation} for input data. - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder forRow(DataStream input, ResolvedSchema resolvedSchema) { - return new Builder().forRow(input, resolvedSchema); - } - - /** - * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s - * into iceberg table. - * - * @param input the source input data stream with {@link RowData}s. - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder forRowData(DataStream input) { - return new Builder().forRowData(input); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java deleted file mode 100644 index 7234cf74020e..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSinkWriter.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.Collection; -import java.util.concurrent.TimeUnit; -import org.apache.flink.api.connector.sink2.CommittingSinkWriter; -import org.apache.flink.api.connector.sink2.SinkWriter; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Iceberg writer implementation for the {@link SinkWriter} interface. Used by the {@link - * org.apache.iceberg.flink.sink.IcebergSink} (SinkV2). Writes out the data to the final place, and - * emits a single {@link WriteResult} at every checkpoint for every data/delete file created by this - * writer. 
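For orientation while reading this removal, the per-checkpoint contract described above can be shown with a minimal, self-contained CommittingSinkWriter sketch. This is not the Iceberg implementation: the String input and Long committable are hypothetical stand-ins for RowData and WriteResult, and a simple counter stands in for the TaskWriter the deleted class delegates to.

    import java.io.IOException;
    import java.util.Collection;
    import java.util.Collections;
    import org.apache.flink.api.connector.sink2.CommittingSinkWriter;

    // Hypothetical sketch: buffer between checkpoints, emit one committable from prepareCommit().
    final class CountingSinkWriter implements CommittingSinkWriter<String, Long> {
      private long rowsSinceLastCheckpoint = 0L;

      @Override
      public void write(String element, Context context) throws IOException, InterruptedException {
        rowsSinceLastCheckpoint++; // analogous to TaskWriter#write
      }

      @Override
      public void flush(boolean endOfInput) {
        // No-op here; the per-checkpoint work happens in prepareCommit().
      }

      @Override
      public Collection<Long> prepareCommit() throws IOException {
        long committable = rowsSinceLastCheckpoint; // analogous to TaskWriter#complete()
        rowsSinceLastCheckpoint = 0L;               // start fresh for the next checkpoint
        return Collections.singletonList(committable);
      }

      @Override
      public void close() {}
    }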
- */ -class IcebergSinkWriter implements CommittingSinkWriter { - private static final Logger LOG = LoggerFactory.getLogger(IcebergSinkWriter.class); - - private final String fullTableName; - private final TaskWriterFactory taskWriterFactory; - private final IcebergStreamWriterMetrics metrics; - private TaskWriter writer; - private final int subTaskId; - private final int attemptId; - - IcebergSinkWriter( - String fullTableName, - TaskWriterFactory taskWriterFactory, - IcebergStreamWriterMetrics metrics, - int subTaskId, - int attemptId) { - this.fullTableName = fullTableName; - this.taskWriterFactory = taskWriterFactory; - // Initialize the task writer factory. - taskWriterFactory.initialize(subTaskId, attemptId); - // Initialize the task writer. - this.writer = taskWriterFactory.create(); - this.metrics = metrics; - this.subTaskId = subTaskId; - this.attemptId = attemptId; - LOG.debug( - "Created Stream Writer for table {} subtask {} attemptId {}", - fullTableName, - subTaskId, - attemptId); - } - - @Override - public void write(RowData element, Context context) throws IOException, InterruptedException { - writer.write(element); - } - - @Override - public void flush(boolean endOfInput) { - // flush is used to handle flush/endOfInput, so no action is taken here. - } - - @Override - public void close() throws Exception { - if (writer != null) { - writer.close(); - } - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("tableName", fullTableName) - .add("subTaskId", subTaskId) - .add("attemptId", attemptId) - .toString(); - } - - @Override - public Collection prepareCommit() throws IOException { - long startNano = System.nanoTime(); - WriteResult result = writer.complete(); - this.writer = taskWriterFactory.create(); - metrics.updateFlushResult(result); - metrics.flushDuration(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); - LOG.debug( - "Iceberg writer subtask {} attempt {} flushed {} data files and {} delete files", - subTaskId, - attemptId, - result.dataFiles().length, - result.deleteFiles().length); - return Lists.newArrayList(result); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java deleted file mode 100644 index 412d6c7081bf..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.concurrent.TimeUnit; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.ChainingStrategy; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -class IcebergStreamWriter extends AbstractStreamOperator - implements OneInputStreamOperator, BoundedOneInput { - - private static final long serialVersionUID = 1L; - static final long END_INPUT_CHECKPOINT_ID = Long.MAX_VALUE; - - private final String fullTableName; - private final TaskWriterFactory taskWriterFactory; - - private transient TaskWriter writer; - private transient int subTaskId; - private transient int attemptId; - private transient IcebergStreamWriterMetrics writerMetrics; - - IcebergStreamWriter(String fullTableName, TaskWriterFactory taskWriterFactory) { - this.fullTableName = fullTableName; - this.taskWriterFactory = taskWriterFactory; - setChainingStrategy(ChainingStrategy.ALWAYS); - } - - @Override - public void open() { - this.subTaskId = getRuntimeContext().getIndexOfThisSubtask(); - this.attemptId = getRuntimeContext().getAttemptNumber(); - this.writerMetrics = new IcebergStreamWriterMetrics(super.metrics, fullTableName); - - // Initialize the task writer factory. - this.taskWriterFactory.initialize(subTaskId, attemptId); - - // Initialize the task writer. - this.writer = taskWriterFactory.create(); - } - - @Override - public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { - flush(checkpointId); - this.writer = taskWriterFactory.create(); - } - - @Override - public void processElement(StreamRecord element) throws Exception { - writer.write(element.getValue()); - } - - @Override - public void close() throws Exception { - super.close(); - if (writer != null) { - writer.close(); - writer = null; - } - } - - @Override - public void endInput() throws IOException { - // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the - // remaining completed files to downstream before closing the writer so that we won't miss any - // of them. - // Note that if the task is not closed after calling endInput, checkpoint may be triggered again - // causing files to be sent repeatedly, the writer is marked as null after the last file is sent - // to guard against duplicated writes. - flush(END_INPUT_CHECKPOINT_ID); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("tableName", fullTableName) - .add("subTaskId", subTaskId) - .add("attemptId", attemptId) - .toString(); - } - - /** close all open files and emit files to downstream committer operator */ - private void flush(long checkpointId) throws IOException { - if (writer == null) { - return; - } - - long startNano = System.nanoTime(); - WriteResult result = writer.complete(); - writerMetrics.updateFlushResult(result); - output.collect(new StreamRecord<>(new FlinkWriteResult(checkpointId, result))); - writerMetrics.flushDuration(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); - - // Set writer to null to prevent duplicate flushes in the corner case of - // prepareSnapshotPreBarrier happening after endInput. 
- writer = null; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java deleted file mode 100644 index 434f3969577f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import com.codahale.metrics.SlidingWindowReservoir; -import java.util.Arrays; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.annotation.Internal; -import org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper; -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.Histogram; -import org.apache.flink.metrics.MetricGroup; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.util.ScanTaskUtil; - -@Internal -public class IcebergStreamWriterMetrics { - // 1,024 reservoir size should cost about 8KB, which is quite small. - // It should also produce good accuracy for histogram distribution (like percentiles). 
- private static final int HISTOGRAM_RESERVOIR_SIZE = 1024; - - private final Counter flushedDataFiles; - private final Counter flushedDeleteFiles; - private final Counter flushedReferencedDataFiles; - private final AtomicLong lastFlushDurationMs; - private final Histogram dataFilesSizeHistogram; - private final Histogram deleteFilesSizeHistogram; - - public IcebergStreamWriterMetrics(MetricGroup metrics, String fullTableName) { - MetricGroup writerMetrics = - metrics.addGroup("IcebergStreamWriter").addGroup("table", fullTableName); - this.flushedDataFiles = writerMetrics.counter("flushedDataFiles"); - this.flushedDeleteFiles = writerMetrics.counter("flushedDeleteFiles"); - this.flushedReferencedDataFiles = writerMetrics.counter("flushedReferencedDataFiles"); - this.lastFlushDurationMs = new AtomicLong(); - writerMetrics.gauge("lastFlushDurationMs", lastFlushDurationMs::get); - - com.codahale.metrics.Histogram dropwizardDataFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.dataFilesSizeHistogram = - writerMetrics.histogram( - "dataFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDataFilesSizeHistogram)); - com.codahale.metrics.Histogram dropwizardDeleteFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.deleteFilesSizeHistogram = - writerMetrics.histogram( - "deleteFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDeleteFilesSizeHistogram)); - } - - public void updateFlushResult(WriteResult result) { - flushedDataFiles.inc(result.dataFiles().length); - flushedDeleteFiles.inc(result.deleteFiles().length); - flushedReferencedDataFiles.inc(result.referencedDataFiles().length); - - // For file size distribution histogram, we don't have to update them after successful commits. - // This should works equally well and we avoided the overhead of tracking the list of file sizes - // in the {@link CommitSummary}, which currently stores simple stats for counters and gauges - // metrics. - Arrays.stream(result.dataFiles()) - .forEach( - dataFile -> { - dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); - }); - Arrays.stream(result.deleteFiles()) - .forEach( - deleteFile -> { - deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); - }); - } - - public void flushDuration(long flushDurationMs) { - lastFlushDurationMs.set(flushDurationMs); - } - - public Counter getFlushedDataFiles() { - return flushedDataFiles; - } - - public Counter getFlushedDeleteFiles() { - return flushedDeleteFiles; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java deleted file mode 100644 index 794ade577976..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergWriteAggregator.java +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.Collection; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; -import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; -import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Operator which aggregates the individual {@link WriteResult} objects) to a single {@link - * IcebergCommittable} per checkpoint (storing the serialized {@link - * org.apache.iceberg.flink.sink.DeltaManifests}, jobId, operatorId, checkpointId) - */ -class IcebergWriteAggregator extends AbstractStreamOperator> - implements OneInputStreamOperator< - CommittableMessage, CommittableMessage> { - private static final Logger LOG = LoggerFactory.getLogger(IcebergWriteAggregator.class); - private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; - private final Collection results; - private transient ManifestOutputFileFactory icebergManifestOutputFileFactory; - private transient Table table; - private final TableLoader tableLoader; - - IcebergWriteAggregator(TableLoader tableLoader) { - this.results = Sets.newHashSet(); - this.tableLoader = tableLoader; - } - - @Override - public void open() throws Exception { - if (!tableLoader.isOpen()) { - tableLoader.open(); - } - - String flinkJobId = getContainingTask().getEnvironment().getJobID().toString(); - String operatorId = getOperatorID().toString(); - int subTaskId = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); - Preconditions.checkArgument( - subTaskId == 0, "The subTaskId must be zero in the IcebergWriteAggregator"); - int attemptId = getRuntimeContext().getTaskInfo().getAttemptNumber(); - this.table = tableLoader.loadTable(); - - this.icebergManifestOutputFileFactory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, subTaskId, attemptId); - } - - @Override - public void finish() throws IOException { - prepareSnapshotPreBarrier(Long.MAX_VALUE); - } - - @Override - public void prepareSnapshotPreBarrier(long checkpointId) throws IOException { - IcebergCommittable committable = - new IcebergCommittable( - writeToManifest(results, checkpointId), - getContainingTask().getEnvironment().getJobID().toString(), - getRuntimeContext().getOperatorUniqueID(), - checkpointId); - CommittableMessage summary = - new CommittableSummary<>(0, 1, checkpointId, 1, 1, 0); - output.collect(new StreamRecord<>(summary)); - CommittableMessage message = 
- new CommittableWithLineage<>(committable, checkpointId, 0); - output.collect(new StreamRecord<>(message)); - LOG.info("Emitted commit message to downstream committer operator"); - results.clear(); - } - - /** - * Write all the completed data files to a newly created manifest file and return the manifest's - * avro serialized bytes. - */ - public byte[] writeToManifest(Collection writeResults, long checkpointId) - throws IOException { - if (writeResults.isEmpty()) { - return EMPTY_MANIFEST_DATA; - } - - WriteResult result = WriteResult.builder().addAll(writeResults).build(); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - result, () -> icebergManifestOutputFileFactory.create(checkpointId), table.spec()); - - return SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, deltaManifests); - } - - @Override - public void processElement(StreamRecord> element) - throws Exception { - - if (element.isRecord() && element.getValue() instanceof CommittableWithLineage) { - results.add(((CommittableWithLineage) element.getValue()).getCommittable()); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java deleted file mode 100644 index 6ba87bea30c2..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.Locale; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Supplier; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Strings; - -@Internal -public class ManifestOutputFileFactory { - // Users could define their own flink manifests directory by setting this value in table - // properties. - @VisibleForTesting static final String FLINK_MANIFEST_LOCATION = "flink.manifests.location"; - private final Supplier
<Table> tableSupplier; - private final Map<String, String> props; - private final String flinkJobId; - private final String operatorUniqueId; - private final int subTaskId; - private final long attemptNumber; - private final AtomicInteger fileCount = new AtomicInteger(0); - - ManifestOutputFileFactory( - Supplier<Table>
    tableSupplier, - Map props, - String flinkJobId, - String operatorUniqueId, - int subTaskId, - long attemptNumber) { - this.tableSupplier = tableSupplier; - this.props = props; - this.flinkJobId = flinkJobId; - this.operatorUniqueId = operatorUniqueId; - this.subTaskId = subTaskId; - this.attemptNumber = attemptNumber; - } - - private String generatePath(long checkpointId) { - return FileFormat.AVRO.addExtension( - String.format( - Locale.ROOT, - "%s-%s-%05d-%d-%d-%05d", - flinkJobId, - operatorUniqueId, - subTaskId, - attemptNumber, - checkpointId, - fileCount.incrementAndGet())); - } - - public OutputFile create(long checkpointId) { - String flinkManifestDir = props.get(FLINK_MANIFEST_LOCATION); - TableOperations ops = ((HasTableOperations) tableSupplier.get()).operations(); - - String newManifestFullPath; - if (Strings.isNullOrEmpty(flinkManifestDir)) { - // User don't specify any flink manifest directory, so just use the default metadata path. - newManifestFullPath = ops.metadataFileLocation(generatePath(checkpointId)); - } else { - newManifestFullPath = - String.format("%s/%s", stripTrailingSlash(flinkManifestDir), generatePath(checkpointId)); - } - - return tableSupplier.get().io().newOutputFile(newManifestFullPath); - } - - private static String stripTrailingSlash(String path) { - String result = path; - while (result.endsWith("/")) { - result = result.substring(0, result.length() - 1); - } - return result; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java deleted file mode 100644 index 17c8233e1f6f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.RowDataWrapper; - -/** - * Create a {@link KeySelector} to shuffle by partition key, then each partition/bucket will be - * wrote by only one task. That will reduce lots of small files in partitioned fanout write policy - * for {@link FlinkSink}. 
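The routing idea in the Javadoc above (all rows of one partition go to a single writer subtask, which keeps small files down under fanout writes) can be sketched with a plain DataStream keyBy. The Event type and its eventDate field are hypothetical; the real selector derives the key from an Iceberg PartitionKey rather than a hand-formatted string.

    import org.apache.flink.api.java.functions.KeySelector;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.datastream.KeyedStream;

    final class PartitionKeyExample {
      // Hypothetical input record carrying the column the table is partitioned by.
      static final class Event {
        String eventDate; // e.g. "2024-05-01"
        String payload;
      }

      static KeyedStream<Event, String> keyByPartition(DataStream<Event> input) {
        // Rows with the same partition value are routed to the same subtask, so each
        // partition is written by one task and fewer small files are produced.
        return input.keyBy((KeySelector<Event, String>) event -> "event_date=" + event.eventDate);
      }
    }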
- */ -@Internal -public class PartitionKeySelector implements KeySelector { - - private final Schema schema; - private final PartitionKey partitionKey; - private final RowType flinkSchema; - - private transient RowDataWrapper rowDataWrapper; - - public PartitionKeySelector(PartitionSpec spec, Schema schema, RowType flinkSchema) { - this.schema = schema; - this.partitionKey = new PartitionKey(spec, schema); - this.flinkSchema = flinkSchema; - } - - /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not - * serializable. In this way, we don't have to serialize them with forcing. - */ - private RowDataWrapper lazyRowDataWrapper() { - if (rowDataWrapper == null) { - rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - return rowDataWrapper; - } - - @Override - public String getKey(RowData row) { - partitionKey.partition(lazyRowDataWrapper().wrap(row)); - return partitionKey.toPath(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java deleted file mode 100644 index 3eb4dba80281..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Map; -import java.util.Set; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.Tasks; - -class PartitionedDeltaWriter extends BaseDeltaTaskWriter { - - private final PartitionKey partitionKey; - - private final Map writers = Maps.newHashMap(); - - PartitionedDeltaWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - Set equalityFieldIds, - boolean upsert) { - super( - spec, - format, - appenderFactory, - fileFactory, - io, - targetFileSize, - schema, - flinkSchema, - equalityFieldIds, - upsert); - this.partitionKey = new PartitionKey(spec, schema); - } - - @Override - RowDataDeltaWriter route(RowData row) { - partitionKey.partition(wrapper().wrap(row)); - - RowDataDeltaWriter writer = writers.get(partitionKey); - if (writer == null) { - // NOTICE: we need to copy a new partition key here, in case of messing up the keys in - // writers. - PartitionKey copiedKey = partitionKey.copy(); - writer = new RowDataDeltaWriter(copiedKey); - writers.put(copiedKey, writer); - } - - return writer; - } - - @Override - public void close() { - try { - Tasks.foreach(writers.values()) - .throwFailureWhenFinished() - .noRetry() - .run(RowDataDeltaWriter::close, IOException.class); - - writers.clear(); - } catch (IOException e) { - throw new UncheckedIOException("Failed to close equality delta writer", e); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java deleted file mode 100644 index 7c11b20c449d..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.Collection; -import java.util.Map; -import java.util.Set; -import java.util.function.Supplier; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.io.PartitionedFanoutWriter; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.UnpartitionedWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.SerializableSupplier; - -public class RowDataTaskWriterFactory implements TaskWriterFactory { - private final Supplier
    tableSupplier; - private final Schema schema; - private final RowType flinkSchema; - private final PartitionSpec spec; - private final long targetFileSizeBytes; - private final FileFormat format; - private final Set equalityFieldIds; - private final boolean upsert; - private final FileAppenderFactory appenderFactory; - - private transient OutputFileFactory outputFileFactory; - - public RowDataTaskWriterFactory( - Table table, - RowType flinkSchema, - long targetFileSizeBytes, - FileFormat format, - Map writeProperties, - Collection equalityFieldIds, - boolean upsert) { - this( - () -> table, - flinkSchema, - targetFileSizeBytes, - format, - writeProperties, - equalityFieldIds, - upsert); - } - - public RowDataTaskWriterFactory( - SerializableSupplier
    tableSupplier, - RowType flinkSchema, - long targetFileSizeBytes, - FileFormat format, - Map writeProperties, - Collection equalityFieldIds, - boolean upsert) { - this( - tableSupplier, - flinkSchema, - targetFileSizeBytes, - format, - writeProperties, - equalityFieldIds, - upsert, - tableSupplier.get().schema(), - tableSupplier.get().spec()); - } - - public RowDataTaskWriterFactory( - SerializableSupplier
    tableSupplier, - RowType flinkSchema, - long targetFileSizeBytes, - FileFormat format, - Map writeProperties, - Collection equalityFieldIds, - boolean upsert, - Schema schema, - PartitionSpec spec) { - this.tableSupplier = tableSupplier; - - Table table; - if (tableSupplier instanceof CachingTableSupplier) { - // rely on the initial table metadata for schema, etc., until schema evolution is supported - table = ((CachingTableSupplier) tableSupplier).initialTable(); - } else { - table = tableSupplier.get(); - } - - this.schema = schema; - this.flinkSchema = flinkSchema; - this.spec = spec; - this.targetFileSizeBytes = targetFileSizeBytes; - this.format = format; - this.equalityFieldIds = equalityFieldIds != null ? Sets.newHashSet(equalityFieldIds) : null; - this.upsert = upsert; - - if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { - this.appenderFactory = - new FlinkAppenderFactory( - table, schema, flinkSchema, writeProperties, spec, null, null, null); - } else if (upsert) { - // In upsert mode, only the new row is emitted using INSERT row kind. Therefore, any column of - // the inserted row - // may differ from the deleted row other than the primary key fields, and the delete file must - // contain values - // that are correct for the deleted row. Therefore, only write the equality delete fields. - this.appenderFactory = - new FlinkAppenderFactory( - table, - schema, - flinkSchema, - writeProperties, - spec, - ArrayUtil.toPrimitive(equalityFieldIds.toArray(new Integer[0])), - TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), - null); - } else { - this.appenderFactory = - new FlinkAppenderFactory( - table, - schema, - flinkSchema, - writeProperties, - spec, - ArrayUtil.toPrimitive(equalityFieldIds.toArray(new Integer[0])), - schema, - null); - } - } - - @Override - public void initialize(int taskId, int attemptId) { - Table table; - if (tableSupplier instanceof CachingTableSupplier) { - // rely on the initial table metadata for schema, etc., until schema evolution is supported - table = ((CachingTableSupplier) tableSupplier).initialTable(); - } else { - table = tableSupplier.get(); - } - - refreshTable(); - - this.outputFileFactory = - OutputFileFactory.builderFor(table, taskId, attemptId) - .format(format) - .ioSupplier(() -> tableSupplier.get().io()) - .defaultSpec(spec) - .build(); - } - - @Override - public TaskWriter create() { - Preconditions.checkNotNull( - outputFileFactory, - "The outputFileFactory shouldn't be null if we have invoked the initialize()."); - - refreshTable(); - - if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { - // Initialize a task writer to write INSERT only. - if (spec.isUnpartitioned()) { - return new UnpartitionedWriter<>( - spec, - format, - appenderFactory, - outputFileFactory, - tableSupplier.get().io(), - targetFileSizeBytes); - } else { - return new RowDataPartitionedFanoutWriter( - spec, - format, - appenderFactory, - outputFileFactory, - tableSupplier.get().io(), - targetFileSizeBytes, - schema, - flinkSchema); - } - } else { - // Initialize a task writer to write both INSERT and equality DELETE. 
- if (spec.isUnpartitioned()) { - return new UnpartitionedDeltaWriter( - spec, - format, - appenderFactory, - outputFileFactory, - tableSupplier.get().io(), - targetFileSizeBytes, - schema, - flinkSchema, - equalityFieldIds, - upsert); - } else { - return new PartitionedDeltaWriter( - spec, - format, - appenderFactory, - outputFileFactory, - tableSupplier.get().io(), - targetFileSizeBytes, - schema, - flinkSchema, - equalityFieldIds, - upsert); - } - } - } - - void refreshTable() { - if (tableSupplier instanceof CachingTableSupplier) { - ((CachingTableSupplier) tableSupplier).refreshTable(); - } - } - - private static class RowDataPartitionedFanoutWriter extends PartitionedFanoutWriter { - - private final PartitionKey partitionKey; - private final RowDataWrapper rowDataWrapper; - - RowDataPartitionedFanoutWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize); - this.partitionKey = new PartitionKey(spec, schema); - this.rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - - @Override - protected PartitionKey partition(RowData row) { - partitionKey.partition(rowDataWrapper.wrap(row)); - return partitionKey; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java deleted file mode 100644 index b3a9ac6ba2eb..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/SinkUtil.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION; -import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_LEVEL; -import static org.apache.iceberg.TableProperties.ORC_COMPRESSION; -import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; -import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; -import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; - -import java.util.List; -import java.util.Map; -import java.util.Set; -import javax.annotation.Nullable; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@Internal -public class SinkUtil { - - private static final long INITIAL_CHECKPOINT_ID = -1L; - - public static final String FLINK_JOB_ID = "flink.job-id"; - - public static final String OPERATOR_ID = "flink.operator-id"; - public static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; - - private SinkUtil() {} - - private static final Logger LOG = LoggerFactory.getLogger(SinkUtil.class); - - static Set checkAndGetEqualityFieldIds(Table table, List equalityFieldColumns) { - Set equalityFieldIds = Sets.newHashSet(table.schema().identifierFieldIds()); - if (equalityFieldColumns != null && !equalityFieldColumns.isEmpty()) { - Set equalityFieldSet = Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); - for (String column : equalityFieldColumns) { - org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column); - Preconditions.checkNotNull( - field, - "Missing required equality field column '%s' in table schema %s", - column, - table.schema()); - equalityFieldSet.add(field.fieldId()); - } - - if (!equalityFieldSet.equals(table.schema().identifierFieldIds())) { - LOG.warn( - "The configured equality field column IDs {} are not matched with the schema identifier field IDs" - + " {}, use job specified equality field columns as the equality fields by default.", - equalityFieldSet, - table.schema().identifierFieldIds()); - } - equalityFieldIds = Sets.newHashSet(equalityFieldSet); - } - return equalityFieldIds; - } - - static long getMaxCommittedCheckpointId( - Table table, String flinkJobId, String operatorId, String branch) { - Snapshot snapshot = table.snapshot(branch); - long lastCommittedCheckpointId = INITIAL_CHECKPOINT_ID; - - while (snapshot != null) { - Map summary = snapshot.summary(); - String snapshotFlinkJobId = summary.get(FLINK_JOB_ID); - String snapshotOperatorId = summary.get(OPERATOR_ID); - if (flinkJobId.equals(snapshotFlinkJobId) - && (snapshotOperatorId == null || snapshotOperatorId.equals(operatorId))) { - String value = summary.get(MAX_COMMITTED_CHECKPOINT_ID); - if (value != null) { - lastCommittedCheckpointId = Long.parseLong(value); - break; - } - } - Long parentSnapshotId = snapshot.parentId(); - snapshot = parentSnapshotId != null ? table.snapshot(parentSnapshotId) : null; - } - - return lastCommittedCheckpointId; - } - - /** - * Based on the {@link FileFormat} overwrites the table level compression properties for the table - * write. 
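A rough sketch of that merge order, assuming a plain map of table properties and the Parquet codec key from TableProperties (the helper class and method are hypothetical): the table-level properties form the base, and the write configuration then overrides the format-specific compression settings.

    import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION;

    import java.util.HashMap;
    import java.util.Map;

    final class WritePropertiesSketch {
      static Map<String, String> parquetWriteProperties(Map<String, String> tableProps, String codec) {
        Map<String, String> props = new HashMap<>(tableProps); // table-level settings as the base
        props.put(PARQUET_COMPRESSION, codec);                 // write config wins over table config
        return props;
      }
    }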
- * - * @param format The FileFormat to use - * @param conf The write configuration - * @param table The table to get the table level settings - * @return The properties to use for writing - */ - public static Map writeProperties( - FileFormat format, FlinkWriteConf conf, @Nullable Table table) { - Map writeProperties = Maps.newHashMap(); - if (table != null) { - writeProperties.putAll(table.properties()); - } - - switch (format) { - case PARQUET: - writeProperties.put(PARQUET_COMPRESSION, conf.parquetCompressionCodec()); - String parquetCompressionLevel = conf.parquetCompressionLevel(); - if (parquetCompressionLevel != null) { - writeProperties.put(PARQUET_COMPRESSION_LEVEL, parquetCompressionLevel); - } - - break; - case AVRO: - writeProperties.put(AVRO_COMPRESSION, conf.avroCompressionCodec()); - String avroCompressionLevel = conf.avroCompressionLevel(); - if (avroCompressionLevel != null) { - writeProperties.put(AVRO_COMPRESSION_LEVEL, conf.avroCompressionLevel()); - } - - break; - case ORC: - writeProperties.put(ORC_COMPRESSION, conf.orcCompressionCodec()); - writeProperties.put(ORC_COMPRESSION_STRATEGY, conf.orcCompressionStrategy()); - break; - default: - throw new IllegalArgumentException(String.format("Unknown file format %s", format)); - } - - return writeProperties; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java deleted file mode 100644 index e3a1245e8cbd..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.Serializable; -import org.apache.iceberg.io.TaskWriter; - -/** - * Factory to create {@link TaskWriter} - * - * @param data type of record. - */ -public interface TaskWriterFactory extends Serializable { - - /** - * Initialize the factory with a given taskId and attemptId. - * - * @param taskId the identifier of task. - * @param attemptId the attempt id of this task. - */ - void initialize(int taskId, int attemptId); - - /** - * Initialize a {@link TaskWriter} with given task id and attempt id. - * - * @return a newly created task writer. 
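A small sketch of the initialize/create contract documented above, assuming it sits next to the TaskWriterFactory interface shown here (the helper class is hypothetical). It demonstrates the call order when a writer subtask opens; later checkpoints call create() again on the already-initialized factory.

    import org.apache.flink.table.data.RowData;
    import org.apache.iceberg.io.TaskWriter;

    final class TaskWriterLifecycle {
      static TaskWriter<RowData> newWriterFor(TaskWriterFactory<RowData> factory, int subTaskId, int attemptId) {
        factory.initialize(subTaskId, attemptId); // once, when the operator or sink writer opens
        return factory.create();                  // a fresh writer; called again after each completed checkpoint
      }
    }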
- */ - TaskWriter create(); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java deleted file mode 100644 index b6ad03514bb0..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.Set; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; - -class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter { - private final RowDataDeltaWriter writer; - - UnpartitionedDeltaWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - Set equalityFieldIds, - boolean upsert) { - super( - spec, - format, - appenderFactory, - fileFactory, - io, - targetFileSize, - schema, - flinkSchema, - equalityFieldIds, - upsert); - this.writer = new RowDataDeltaWriter(null); - } - - @Override - RowDataDeltaWriter route(RowData row) { - return writer; - } - - @Override - public void close() throws IOException { - writer.close(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java deleted file mode 100644 index 40a3ce0cb846..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/WriteResultSerializer.java +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import org.apache.flink.annotation.Internal; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputViewStreamWrapper; -import org.apache.flink.util.InstantiationUtil; -import org.apache.iceberg.io.WriteResult; - -@Internal -public class WriteResultSerializer implements SimpleVersionedSerializer { - private static final int VERSION = 1; - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(WriteResult writeResult) throws IOException { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - DataOutputViewStreamWrapper view = new DataOutputViewStreamWrapper(out); - byte[] result = InstantiationUtil.serializeObject(writeResult); - view.write(result); - return out.toByteArray(); - } - - @Override - public WriteResult deserialize(int version, byte[] serialized) throws IOException { - if (version == 1) { - DataInputDeserializer view = new DataInputDeserializer(serialized); - byte[] resultBuf = new byte[serialized.length]; - view.read(resultBuf); - try { - return InstantiationUtil.deserializeObject( - resultBuf, IcebergCommittableSerializer.class.getClassLoader()); - } catch (ClassNotFoundException cnc) { - throw new IOException("Could not deserialize the WriteResult object", cnc); - } - } - throw new IOException("Unrecognized version or corrupt state: " + version); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java deleted file mode 100644 index 41ffa609540b..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/CompareSchemasVisitor.java +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.util.List; -import java.util.Map; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.schema.SchemaWithPartnerVisitor; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -/** - * Visitor class which compares an input schema to a table schema and emits a compatibility {@link - * Result}. - * - *
      - *
    • SAME: The two schemas are semantically identical - *
    • DATA_CONVERSION_NEEDED: We can evolve the data associated with the input schema to match - * the table schema. - *
    • SCHEMA_UPDATE_NEEDED: We need to migrate the table schema to match the input schema. - *
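For illustration, a caller could branch on the three results above roughly as follows. This is only a sketch: the input and table schemas are assumed to be available to the caller, and the reactions in the comments are placeholders for the actual conversion or evolution logic.

    // Sketch; CompareSchemasVisitor and its Result enum come from this package,
    // org.apache.iceberg.Schema from core Iceberg.
    static void reconcile(Schema inputSchema, Schema tableSchema) {
      CompareSchemasVisitor.Result result =
          CompareSchemasVisitor.visit(inputSchema, tableSchema, true /* caseSensitive */);
      switch (result) {
        case SAME:
          break; // write the rows unchanged
        case DATA_CONVERSION_NEEDED:
          break; // convert the incoming rows to the table schema before writing
        case SCHEMA_UPDATE_NEEDED:
          break; // evolve the table schema first, then re-run the comparison
      }
    }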
    - * - * The input schema fields are compared to the table schema via their names. - */ -public class CompareSchemasVisitor - extends SchemaWithPartnerVisitor { - - private final Schema tableSchema; - - private CompareSchemasVisitor(Schema tableSchema) { - this.tableSchema = tableSchema; - } - - public static Result visit(Schema dataSchema, Schema tableSchema) { - return visit(dataSchema, tableSchema, true); - } - - public static Result visit(Schema dataSchema, Schema tableSchema, boolean caseSensitive) { - return visit( - dataSchema, - -1, - new CompareSchemasVisitor(tableSchema), - new PartnerIdByNameAccessors(tableSchema, caseSensitive)); - } - - @Override - public Result schema(Schema dataSchema, Integer tableSchemaId, Result downstream) { - if (tableSchemaId == null) { - return Result.SCHEMA_UPDATE_NEEDED; - } - - return downstream; - } - - @Override - public Result struct(Types.StructType struct, Integer tableSchemaId, List fields) { - if (tableSchemaId == null) { - return Result.SCHEMA_UPDATE_NEEDED; - } - - Result result = fields.stream().reduce(Result::merge).orElse(Result.SCHEMA_UPDATE_NEEDED); - - if (result == Result.SCHEMA_UPDATE_NEEDED) { - return Result.SCHEMA_UPDATE_NEEDED; - } - - Type tableSchemaType = - tableSchemaId == -1 ? tableSchema.asStruct() : tableSchema.findField(tableSchemaId).type(); - if (!tableSchemaType.isStructType()) { - return Result.SCHEMA_UPDATE_NEEDED; - } - - for (Types.NestedField tableField : tableSchemaType.asStructType().fields()) { - if (tableField.isRequired() && struct.field(tableField.name()) == null) { - // If a field from the table schema does not exist in the input schema, then we won't visit - // it and check for required/optional compatibility. The only choice is to make the table - // field optional. 
- return Result.SCHEMA_UPDATE_NEEDED; - } - } - - if (struct.fields().size() != tableSchemaType.asStructType().fields().size()) { - return Result.DATA_CONVERSION_NEEDED; - } - - for (int i = 0; i < struct.fields().size(); ++i) { - if (!struct - .fields() - .get(i) - .name() - .equals(tableSchemaType.asStructType().fields().get(i).name())) { - return Result.DATA_CONVERSION_NEEDED; - } - } - - return result; - } - - @Override - public Result field(Types.NestedField field, Integer tableSchemaId, Result typeResult) { - if (tableSchemaId == null) { - return Result.SCHEMA_UPDATE_NEEDED; - } - - if (typeResult != Result.SAME) { - return typeResult; - } - - if (tableSchema.findField(tableSchemaId).isRequired() && field.isOptional()) { - return Result.SCHEMA_UPDATE_NEEDED; - } else { - return Result.SAME; - } - } - - @Override - public Result list(Types.ListType list, Integer tableSchemaId, Result elementsResult) { - if (tableSchemaId == null) { - return Result.SCHEMA_UPDATE_NEEDED; - } - - return elementsResult; - } - - @Override - public Result map( - Types.MapType map, Integer tableSchemaId, Result keyResult, Result valueResult) { - if (tableSchemaId == null) { - return Result.SCHEMA_UPDATE_NEEDED; - } - - return keyResult.merge(valueResult); - } - - @Override - @SuppressWarnings("checkstyle:CyclomaticComplexity") - public Result primitive(Type.PrimitiveType primitive, Integer tableSchemaId) { - if (tableSchemaId == null) { - return Result.SCHEMA_UPDATE_NEEDED; - } - - Type tableSchemaType = tableSchema.findField(tableSchemaId).type(); - if (!tableSchemaType.isPrimitiveType()) { - return Result.SCHEMA_UPDATE_NEEDED; - } - - Type.PrimitiveType tableSchemaPrimitiveType = tableSchemaType.asPrimitiveType(); - if (primitive.equals(tableSchemaPrimitiveType)) { - return Result.SAME; - } else if (primitive.equals(Types.IntegerType.get()) - && tableSchemaPrimitiveType.equals(Types.LongType.get())) { - return Result.DATA_CONVERSION_NEEDED; - } else if (primitive.equals(Types.FloatType.get()) - && tableSchemaPrimitiveType.equals(Types.DoubleType.get())) { - return Result.DATA_CONVERSION_NEEDED; - } else if (primitive.equals(Types.DateType.get()) - && tableSchemaPrimitiveType.equals(Types.TimestampType.withoutZone())) { - return Result.DATA_CONVERSION_NEEDED; - } else if (primitive.typeId() == Type.TypeID.DECIMAL - && tableSchemaPrimitiveType.typeId() == Type.TypeID.DECIMAL) { - Types.DecimalType dataType = (Types.DecimalType) primitive; - Types.DecimalType tableType = (Types.DecimalType) tableSchemaPrimitiveType; - return dataType.scale() == tableType.scale() && dataType.precision() < tableType.precision() - ? Result.DATA_CONVERSION_NEEDED - : Result.SCHEMA_UPDATE_NEEDED; - } else { - return Result.SCHEMA_UPDATE_NEEDED; - } - } - - static class PartnerIdByNameAccessors implements PartnerAccessors { - private final Schema tableSchema; - private boolean caseSensitive = true; - - PartnerIdByNameAccessors(Schema tableSchema) { - this.tableSchema = tableSchema; - } - - private PartnerIdByNameAccessors(Schema tableSchema, boolean caseSensitive) { - this(tableSchema); - this.caseSensitive = caseSensitive; - } - - @Override - public Integer fieldPartner(Integer tableSchemaFieldId, int fieldId, String name) { - Types.StructType struct; - if (tableSchemaFieldId == -1) { - struct = tableSchema.asStruct(); - } else { - struct = tableSchema.findField(tableSchemaFieldId).type().asStructType(); - } - - Types.NestedField field = - caseSensitive ? 
struct.field(name) : struct.caseInsensitiveField(name); - if (field != null) { - return field.fieldId(); - } - - return null; - } - - @Override - public Integer mapKeyPartner(Integer tableSchemaMapId) { - Types.NestedField mapField = tableSchema.findField(tableSchemaMapId); - if (mapField != null) { - return mapField.type().asMapType().fields().get(0).fieldId(); - } - - return null; - } - - @Override - public Integer mapValuePartner(Integer tableSchemaMapId) { - Types.NestedField mapField = tableSchema.findField(tableSchemaMapId); - if (mapField != null) { - return mapField.type().asMapType().fields().get(1).fieldId(); - } - - return null; - } - - @Override - public Integer listElementPartner(Integer tableSchemaListId) { - Types.NestedField listField = tableSchema.findField(tableSchemaListId); - if (listField != null) { - return listField.type().asListType().fields().get(0).fieldId(); - } - - return null; - } - } - - public enum Result { - SAME(0), - DATA_CONVERSION_NEEDED(1), - SCHEMA_UPDATE_NEEDED(2); - - private static final Map BY_ID = Maps.newHashMap(); - - static { - for (Result e : Result.values()) { - if (BY_ID.put(e.id, e) != null) { - throw new IllegalArgumentException("Duplicate id: " + e.id); - } - } - } - - private final int id; - - Result(int id) { - this.id = id; - } - - private Result merge(Result other) { - return BY_ID.get(Math.max(this.id, other.id)); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java deleted file mode 100644 index 34da5efd940f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DataConverter.java +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.Map; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -/** - * {@link org.apache.iceberg.flink.sink.dynamic.DataConverter} is responsible to change the input - * data to make it compatible with the target schema. This is done when - * - *
      - *
    • The input schema has fewer fields than the target schema. - *
    • The table types are wider than the input types. - *
    • The field order differs for source and target schema. - *
    - * - *

    The resolution is as follows: - * - *

      - *
    • In the first case, we would add a null value for the missing field (if the field is - * optional). - *
    • In the second case, we would convert the data for the input field to a wider type, e.g. int - * (input type) => long (table type). - *
    • In the third case, we would rearrange the input data to match the target table. - *
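A minimal sketch of the second case (type widening), using the package-private factory defined below; the Flink logical types and the literal values are only examples:

    // Widening an INT input field to a BIGINT table column.
    DataConverter widen =
        DataConverter.getNullable(
            new org.apache.flink.table.types.logical.IntType(),
            new org.apache.flink.table.types.logical.BigIntType());
    Object widened = widen.convert(42);   // Integer 42 becomes Long 42L
    Object missing = widen.convert(null); // nulls pass through unchanged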
    - */ -interface DataConverter { - Object convert(Object object); - - static DataConverter identity() { - return object -> object; - } - - static DataConverter getNullable(LogicalType sourceType, LogicalType targetType) { - return nullable(get(sourceType, targetType)); - } - - static DataConverter get(LogicalType sourceType, LogicalType targetType) { - switch (targetType.getTypeRoot()) { - case BOOLEAN: - case INTEGER: - case FLOAT: - case VARCHAR: - case DATE: - case TIME_WITHOUT_TIME_ZONE: - case TIMESTAMP_WITH_LOCAL_TIME_ZONE: - case BINARY: - case VARBINARY: - return object -> object; - case DOUBLE: - return object -> { - if (object instanceof Float) { - return ((Float) object).doubleValue(); - } else { - return object; - } - }; - case BIGINT: - return object -> { - if (object instanceof Integer) { - return ((Integer) object).longValue(); - } else { - return object; - } - }; - case DECIMAL: - return object -> { - DecimalType toDecimalType = (DecimalType) targetType; - DecimalData decimalData = (DecimalData) object; - if (((DecimalType) sourceType).getPrecision() == toDecimalType.getPrecision()) { - return object; - } else { - return DecimalData.fromBigDecimal( - decimalData.toBigDecimal(), toDecimalType.getPrecision(), toDecimalType.getScale()); - } - }; - case TIMESTAMP_WITHOUT_TIME_ZONE: - return object -> { - if (object instanceof Integer) { - LocalDateTime dateTime = - LocalDateTime.of(LocalDate.ofEpochDay((Integer) object), LocalTime.MIN); - return TimestampData.fromLocalDateTime(dateTime); - } else { - return object; - } - }; - case ROW: - return new RowDataConverter((RowType) sourceType, (RowType) targetType); - case ARRAY: - return new ArrayConverter((ArrayType) sourceType, (ArrayType) targetType); - case MAP: - return new MapConverter((MapType) sourceType, (MapType) targetType); - default: - throw new UnsupportedOperationException("Not a supported type: " + targetType); - } - } - - static DataConverter nullable(DataConverter converter) { - return value -> value == null ? 
null : converter.convert(value); - } - - class RowDataConverter implements DataConverter { - private final RowData.FieldGetter[] fieldGetters; - private final DataConverter[] dataConverters; - - RowDataConverter(RowType sourceType, RowType targetType) { - this.fieldGetters = new RowData.FieldGetter[targetType.getFields().size()]; - this.dataConverters = new DataConverter[targetType.getFields().size()]; - - for (int i = 0; i < targetType.getFields().size(); i++) { - RowData.FieldGetter fieldGetter; - DataConverter dataConverter; - RowType.RowField targetField = targetType.getFields().get(i); - int sourceFieldIndex = sourceType.getFieldIndex(targetField.getName()); - if (sourceFieldIndex == -1) { - if (targetField.getType().isNullable()) { - fieldGetter = row -> null; - dataConverter = value -> null; - } else { - throw new IllegalArgumentException( - String.format( - "Field %s in target schema %s is non-nullable but does not exist in source schema.", - i + 1, targetType)); - } - } else { - RowType.RowField sourceField = sourceType.getFields().get(sourceFieldIndex); - fieldGetter = RowData.createFieldGetter(sourceField.getType(), sourceFieldIndex); - dataConverter = DataConverter.getNullable(sourceField.getType(), targetField.getType()); - } - - this.fieldGetters[i] = fieldGetter; - this.dataConverters[i] = dataConverter; - } - } - - @Override - public RowData convert(Object object) { - RowData sourceData = (RowData) object; - GenericRowData targetData = new GenericRowData(fieldGetters.length); - for (int i = 0; i < fieldGetters.length; i++) { - Object value = fieldGetters[i].getFieldOrNull(sourceData); - targetData.setField(i, dataConverters[i].convert(value)); - } - - return targetData; - } - } - - class ArrayConverter implements DataConverter { - private final ArrayData.ElementGetter elementGetter; - private final DataConverter elementConverter; - - ArrayConverter(ArrayType sourceType, ArrayType targetType) { - this.elementGetter = ArrayData.createElementGetter(sourceType.getElementType()); - this.elementConverter = - DataConverter.getNullable(sourceType.getElementType(), targetType.getElementType()); - } - - @Override - public ArrayData convert(Object object) { - ArrayData arrayData = (ArrayData) object; - Object[] convertedArray = new Object[arrayData.size()]; - for (int i = 0; i < convertedArray.length; i++) { - Object element = elementGetter.getElementOrNull(arrayData, i); - convertedArray[i] = elementConverter.convert(element); - } - - return new GenericArrayData(convertedArray); - } - } - - class MapConverter implements DataConverter { - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - private final DataConverter keyConverter; - private final DataConverter valueConverter; - - MapConverter(MapType sourceType, MapType targetType) { - this.keyGetter = ArrayData.createElementGetter(sourceType.getKeyType()); - this.valueGetter = ArrayData.createElementGetter(sourceType.getValueType()); - this.keyConverter = - DataConverter.getNullable(sourceType.getKeyType(), targetType.getKeyType()); - this.valueConverter = - DataConverter.getNullable(sourceType.getValueType(), targetType.getValueType()); - } - - @Override - public MapData convert(Object object) { - MapData sourceData = (MapData) object; - ArrayData keyArray = sourceData.keyArray(); - ArrayData valueArray = sourceData.valueArray(); - Map convertedMap = Maps.newLinkedHashMap(); - for (int i = 0; i < keyArray.size(); ++i) { - convertedMap.put( - 
keyConverter.convert(keyGetter.getElementOrNull(keyArray, i)), - valueConverter.convert(valueGetter.getElementOrNull(valueArray, i))); - } - - return new GenericMapData(convertedMap); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java deleted file mode 100644 index 33edefe71eb0..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittable.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.Objects; -import org.apache.iceberg.flink.sink.DeltaManifests; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -/** - * The aggregated results of a single checkpoint which should be committed. Containing the - * serialized {@link DeltaManifests} file - which contains the commit data, and the jobId, - * operatorId, checkpointId triplet to identify the specific commit. - * - *

    {@link DynamicCommittableSerializer} is used to serialize {@link DynamicCommittable} between - * the {@link DynamicWriter} and the {@link DynamicWriteResultAggregator}. - */ -class DynamicCommittable implements Serializable { - - private final WriteTarget key; - private final byte[] manifest; - private final String jobId; - private final String operatorId; - private final long checkpointId; - - DynamicCommittable( - WriteTarget key, byte[] manifest, String jobId, String operatorId, long checkpointId) { - this.key = key; - this.manifest = manifest; - this.jobId = jobId; - this.operatorId = operatorId; - this.checkpointId = checkpointId; - } - - WriteTarget key() { - return key; - } - - byte[] manifest() { - return manifest; - } - - String jobId() { - return jobId; - } - - String operatorId() { - return operatorId; - } - - long checkpointId() { - return checkpointId; - } - - @Override - public boolean equals(Object o) { - if (o == null || getClass() != o.getClass()) { - return false; - } - - DynamicCommittable that = (DynamicCommittable) o; - return checkpointId == that.checkpointId - && Objects.equals(key, that.key) - && Objects.deepEquals(manifest, that.manifest) - && Objects.equals(jobId, that.jobId) - && Objects.equals(operatorId, that.operatorId); - } - - @Override - public int hashCode() { - return Objects.hash(key, Arrays.hashCode(manifest), jobId, operatorId, checkpointId); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("key", key) - .add("jobId", jobId) - .add("checkpointId", checkpointId) - .add("operatorId", operatorId) - .toString(); - } - - public WriteTarget writeTarget() { - return key; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java deleted file mode 100644 index 4aadcf1f3620..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommittableSerializer.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputViewStreamWrapper; - -/** - * This serializer is used for serializing the {@link DynamicCommittable} objects between the {@link - * DynamicWriter} and the {@link DynamicWriteResultAggregator} operator and for sending it down to - * the {@link DynamicCommitter}. 
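A hedged sketch of that round trip; the {@code WriteTarget} and the manifest bytes are assumed to be produced elsewhere (for example by the writer), and both calls declare {@code IOException}:

    DynamicCommittableSerializer serializer = new DynamicCommittableSerializer();
    DynamicCommittable committable =
        new DynamicCommittable(writeTarget, manifestBytes, "flink-job-id", "operator-id", 42L);
    byte[] bytes = serializer.serialize(committable);
    DynamicCommittable restored = serializer.deserialize(serializer.getVersion(), bytes);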
- */ -class DynamicCommittableSerializer implements SimpleVersionedSerializer { - - private static final int VERSION = 1; - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(DynamicCommittable committable) throws IOException { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - DataOutputViewStreamWrapper view = new DataOutputViewStreamWrapper(out); - committable.key().serializeTo(view); - view.writeUTF(committable.jobId()); - view.writeUTF(committable.operatorId()); - view.writeLong(committable.checkpointId()); - view.writeInt(committable.manifest().length); - view.write(committable.manifest()); - return out.toByteArray(); - } - - @Override - public DynamicCommittable deserialize(int version, byte[] serialized) throws IOException { - if (version == 1) { - DataInputDeserializer view = new DataInputDeserializer(serialized); - WriteTarget key = WriteTarget.deserializeFrom(view); - String jobId = view.readUTF(); - String operatorId = view.readUTF(); - long checkpointId = view.readLong(); - int manifestLen = view.readInt(); - byte[] manifestBuf; - manifestBuf = new byte[manifestLen]; - view.read(manifestBuf); - return new DynamicCommittable(key, manifestBuf, jobId, operatorId, checkpointId); - } - - throw new IOException("Unrecognized version or corrupt state: " + version); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java deleted file mode 100644 index e58066aac6ca..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitter.java +++ /dev/null @@ -1,424 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import java.util.Objects; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.sink2.Committer; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ReplacePartitions; -import org.apache.iceberg.RowDelta; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotSummary; -import org.apache.iceberg.SnapshotUpdate; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.sink.CommitSummary; -import org.apache.iceberg.flink.sink.DeltaManifests; -import org.apache.iceberg.flink.sink.DeltaManifestsSerializer; -import org.apache.iceberg.flink.sink.FlinkManifestUtil; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This class implements the Flink SinkV2 {@link Committer} interface to implement the Iceberg - * commits. The implementation builds on the following assumptions: - * - *

      - *
    • There is a single {@link DynamicCommittable} for every table / branch / checkpoint - *
    • There is no late checkpoint - if checkpoint 'x' has been received in one call, then after a - * successful run only checkpoints > x will arrive - *
    • There is no other writer which would generate another commit to the same branch with the - * same jobId-operatorId-checkpointId triplet - *
    - */ -@Internal -class DynamicCommitter implements Committer { - - private static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; - private static final Logger LOG = LoggerFactory.getLogger(DynamicCommitter.class); - private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; - private static final WriteResult EMPTY_WRITE_RESULT = - WriteResult.builder() - .addDataFiles(Lists.newArrayList()) - .addDeleteFiles(Lists.newArrayList()) - .build(); - - private static final long INITIAL_CHECKPOINT_ID = -1L; - - @VisibleForTesting - static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; - - private static final String FLINK_JOB_ID = "flink.job-id"; - private static final String OPERATOR_ID = "flink.operator-id"; - private final Map snapshotProperties; - private final boolean replacePartitions; - private final DynamicCommitterMetrics committerMetrics; - private final Catalog catalog; - private final Map maxContinuousEmptyCommitsMap; - private final Map continuousEmptyCheckpointsMap; - private final ExecutorService workerPool; - - DynamicCommitter( - Catalog catalog, - Map snapshotProperties, - boolean replacePartitions, - int workerPoolSize, - String sinkId, - DynamicCommitterMetrics committerMetrics) { - this.snapshotProperties = snapshotProperties; - this.replacePartitions = replacePartitions; - this.committerMetrics = committerMetrics; - this.catalog = catalog; - this.maxContinuousEmptyCommitsMap = Maps.newHashMap(); - this.continuousEmptyCheckpointsMap = Maps.newHashMap(); - - this.workerPool = - ThreadPools.newFixedThreadPool("iceberg-committer-pool-" + sinkId, workerPoolSize); - } - - @Override - public void commit(Collection> commitRequests) - throws IOException, InterruptedException { - if (commitRequests.isEmpty()) { - return; - } - - // For every table and every checkpoint, we store the list of to-be-committed - // DynamicCommittable. - // There may be DynamicCommittable from previous checkpoints which have not been committed yet. 
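As an illustration of how already-committed checkpoints are detected (a simplified sketch that only inspects the latest snapshot; {@code table}, {@code branch} and {@code committable} are assumed to be in scope, and the summary key is the MAX_COMMITTED_CHECKPOINT_ID constant above; the actual getMaxCommittedCheckpointId(..) below also walks parent snapshots and matches the jobId/operatorId):

    Snapshot snapshot = table.snapshot(branch);
    String value =
        snapshot == null ? null : snapshot.summary().get("flink.max-committed-checkpoint-id");
    boolean alreadyCommitted =
        value != null && Long.parseLong(value) >= committable.checkpointId();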
- Map>>> commitRequestMap = - Maps.newHashMap(); - for (CommitRequest request : commitRequests) { - NavigableMap>> committables = - commitRequestMap.computeIfAbsent( - new TableKey(request.getCommittable()), unused -> Maps.newTreeMap()); - committables - .computeIfAbsent(request.getCommittable().checkpointId(), unused -> Lists.newArrayList()) - .add(request); - } - - for (Map.Entry>>> entry : - commitRequestMap.entrySet()) { - Table table = catalog.loadTable(TableIdentifier.parse(entry.getKey().tableName())); - DynamicCommittable last = entry.getValue().lastEntry().getValue().get(0).getCommittable(); - long maxCommittedCheckpointId = - getMaxCommittedCheckpointId( - table, last.jobId(), last.operatorId(), entry.getKey().branch()); - // Mark the already committed FilesCommittable(s) as finished - entry - .getValue() - .headMap(maxCommittedCheckpointId, true) - .values() - .forEach(list -> list.forEach(CommitRequest::signalAlreadyCommitted)); - NavigableMap>> uncommitted = - entry.getValue().tailMap(maxCommittedCheckpointId, false); - if (!uncommitted.isEmpty()) { - commitPendingRequests( - table, entry.getKey().branch(), uncommitted, last.jobId(), last.operatorId()); - } - } - } - - private static long getMaxCommittedCheckpointId( - Table table, String flinkJobId, String operatorId, String branch) { - Snapshot snapshot = table.snapshot(branch); - long lastCommittedCheckpointId = INITIAL_CHECKPOINT_ID; - - while (snapshot != null) { - Map summary = snapshot.summary(); - String snapshotFlinkJobId = summary.get(FLINK_JOB_ID); - String snapshotOperatorId = summary.get(OPERATOR_ID); - if (flinkJobId.equals(snapshotFlinkJobId) - && (snapshotOperatorId == null || snapshotOperatorId.equals(operatorId))) { - String value = summary.get(MAX_COMMITTED_CHECKPOINT_ID); - if (value != null) { - lastCommittedCheckpointId = Long.parseLong(value); - break; - } - } - - Long parentSnapshotId = snapshot.parentId(); - snapshot = parentSnapshotId != null ? table.snapshot(parentSnapshotId) : null; - } - - return lastCommittedCheckpointId; - } - - /** - * Commits the data to the Iceberg table by reading the file data from the {@link DeltaManifests} - * ordered by the checkpointId, and writing the new snapshot to the Iceberg table. The {@link - * SnapshotSummary} will contain the jobId, snapshotId, checkpointId so in case of job restart we - * can identify which changes are committed, and which are still waiting for the commit. 
- * - * @param commitRequestMap The checkpointId to {@link CommitRequest} map of the changes to commit - * @param newFlinkJobId The jobId to store in the {@link SnapshotSummary} - * @param operatorId The operatorId to store in the {@link SnapshotSummary} - * @throws IOException On commit failure - */ - private void commitPendingRequests( - Table table, - String branch, - NavigableMap>> commitRequestMap, - String newFlinkJobId, - String operatorId) - throws IOException { - long checkpointId = commitRequestMap.lastKey(); - List manifests = Lists.newArrayList(); - NavigableMap> pendingResults = Maps.newTreeMap(); - for (Map.Entry>> e : commitRequestMap.entrySet()) { - for (CommitRequest committable : e.getValue()) { - if (Arrays.equals(EMPTY_MANIFEST_DATA, committable.getCommittable().manifest())) { - pendingResults - .computeIfAbsent(e.getKey(), unused -> Lists.newArrayList()) - .add(EMPTY_WRITE_RESULT); - } else { - DeltaManifests deltaManifests = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, committable.getCommittable().manifest()); - pendingResults - .computeIfAbsent(e.getKey(), unused -> Lists.newArrayList()) - .add(FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs())); - manifests.addAll(deltaManifests.manifests()); - } - } - } - - CommitSummary summary = new CommitSummary(); - summary.addAll(pendingResults); - commitPendingResult(table, branch, pendingResults, summary, newFlinkJobId, operatorId); - if (committerMetrics != null) { - committerMetrics.updateCommitSummary(table.name(), summary); - } - - FlinkManifestUtil.deleteCommittedManifests(table, manifests, newFlinkJobId, checkpointId); - } - - private void commitPendingResult( - Table table, - String branch, - NavigableMap> pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId) { - long totalFiles = summary.dataFilesCount() + summary.deleteFilesCount(); - TableKey key = new TableKey(table.name(), branch); - int continuousEmptyCheckpoints = - continuousEmptyCheckpointsMap.computeIfAbsent(key, unused -> 0); - int maxContinuousEmptyCommits = - maxContinuousEmptyCommitsMap.computeIfAbsent( - key, - unused -> { - int result = - PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); - Preconditions.checkArgument( - result > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); - return result; - }); - continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0; - if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { - if (replacePartitions) { - replacePartitions(table, branch, pendingResults, summary, newFlinkJobId, operatorId); - } else { - commitDeltaTxn(table, branch, pendingResults, summary, newFlinkJobId, operatorId); - } - - continuousEmptyCheckpoints = 0; - } else { - long checkpointId = pendingResults.lastKey(); - LOG.info("Skip commit for checkpoint {} due to no data files or delete files.", checkpointId); - } - - continuousEmptyCheckpointsMap.put(key, continuousEmptyCheckpoints); - } - - private void replacePartitions( - Table table, - String branch, - NavigableMap> pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId) { - for (Map.Entry> e : pendingResults.entrySet()) { - // We don't commit the merged result into a single transaction because for the sequential - // transaction txn1 and txn2, the equality-delete files of txn2 are required to be applied - // to data files from txn1. 
Committing the merged one will lead to the incorrect delete - // semantic. - for (WriteResult result : e.getValue()) { - ReplacePartitions dynamicOverwrite = - table.newReplacePartitions().scanManifestsWith(workerPool); - Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); - commitOperation( - table, - branch, - dynamicOverwrite, - summary, - "dynamic partition overwrite", - newFlinkJobId, - operatorId, - e.getKey()); - } - } - } - - private void commitDeltaTxn( - Table table, - String branch, - NavigableMap> pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId) { - for (Map.Entry> e : pendingResults.entrySet()) { - // We don't commit the merged result into a single transaction because for the sequential - // transaction txn1 and txn2, the equality-delete files of txn2 are required to be applied - // to data files from txn1. Committing the merged one will lead to the incorrect delete - // semantic. - for (WriteResult result : e.getValue()) { - // Row delta validations are not needed for streaming changes that write equality deletes. - // Equality deletes are applied to data in all previous sequence numbers, so retries may - // push deletes further in the future, but do not affect correctness. Position deletes - // committed to the table in this path are used only to delete rows from data files that are - // being added in this commit. There is no way for data files added along with the delete - // files to be concurrently removed, so there is no need to validate the files referenced by - // the position delete files that are being committed. - RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool); - - Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); - Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - commitOperation( - table, branch, rowDelta, summary, "rowDelta", newFlinkJobId, operatorId, e.getKey()); - } - } - } - - @VisibleForTesting - void commitOperation( - Table table, - String branch, - SnapshotUpdate operation, - CommitSummary summary, - String description, - String newFlinkJobId, - String operatorId, - long checkpointId) { - - LOG.info( - "Committing {} for checkpoint {} to table {} branch {} with summary: {}", - description, - checkpointId, - table.name(), - branch, - summary); - snapshotProperties.forEach(operation::set); - // custom snapshot metadata properties will be overridden if they conflict with internal ones - // used by the sink. - operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); - operation.set(FLINK_JOB_ID, newFlinkJobId); - operation.set(OPERATOR_ID, operatorId); - operation.toBranch(branch); - - long startNano = System.nanoTime(); - operation.commit(); // abort is automatically called if this fails. 
- long durationMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano); - LOG.info( - "Committed {} to table: {}, branch: {}, checkpointId {} in {} ms", - description, - table.name(), - branch, - checkpointId, - durationMs); - if (committerMetrics != null) { - committerMetrics.commitDuration(table.name(), durationMs); - } - } - - @Override - public void close() throws IOException { - workerPool.shutdown(); - } - - private static class TableKey implements Serializable { - private String tableName; - private String branch; - - TableKey(String tableName, String branch) { - this.tableName = tableName; - this.branch = branch; - } - - TableKey(DynamicCommittable committable) { - this.tableName = committable.key().tableName(); - this.branch = committable.key().branch(); - } - - String tableName() { - return tableName; - } - - String branch() { - return branch; - } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - - if (other == null || getClass() != other.getClass()) { - return false; - } - - TableKey that = (TableKey) other; - return tableName.equals(that.tableName) && branch.equals(that.branch); - } - - @Override - public int hashCode() { - return Objects.hash(tableName, branch); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("tableName", tableName) - .add("branch", branch) - .toString(); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java deleted file mode 100644 index d34feea75285..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicCommitterMetrics.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.util.Map; -import org.apache.flink.metrics.MetricGroup; -import org.apache.iceberg.flink.sink.CommitSummary; -import org.apache.iceberg.flink.sink.IcebergFilesCommitterMetrics; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -class DynamicCommitterMetrics { - - private final Map metrics; - private final MetricGroup mainMetricsGroup; - - DynamicCommitterMetrics(MetricGroup mainMetricsGroup) { - this.mainMetricsGroup = mainMetricsGroup; - this.metrics = Maps.newHashMap(); - } - - public void commitDuration(String fullTableName, long commitDurationMs) { - committerMetrics(fullTableName).commitDuration(commitDurationMs); - } - - /** This is called upon a successful commit. 
*/ - public void updateCommitSummary(String fullTableName, CommitSummary stats) { - committerMetrics(fullTableName).updateCommitSummary(stats); - } - - private IcebergFilesCommitterMetrics committerMetrics(String fullTableName) { - return metrics.computeIfAbsent( - fullTableName, tableName -> new IcebergFilesCommitterMetrics(mainMetricsGroup, tableName)); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java deleted file mode 100644 index 9547de78d6ba..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ /dev/null @@ -1,427 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.io.IOException; -import java.util.Map; -import java.util.Optional; -import java.util.UUID; -import org.apache.flink.annotation.Experimental; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.sink2.Committer; -import org.apache.flink.api.connector.sink2.CommitterInitContext; -import org.apache.flink.api.connector.sink2.Sink; -import org.apache.flink.api.connector.sink2.SinkWriter; -import org.apache.flink.api.connector.sink2.SupportsCommitter; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessageTypeInfo; -import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; -import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; -import org.apache.flink.streaming.api.connector.sink2.SupportsPostCommitTopology; -import org.apache.flink.streaming.api.connector.sink2.SupportsPreCommitTopology; -import org.apache.flink.streaming.api.connector.sink2.SupportsPreWriteTopology; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSink; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.flink.table.data.RowData; -import org.apache.flink.util.OutputTag; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.sink.IcebergSink; -import 
org.apache.iceberg.flink.sink.SinkUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -/** - * Dynamic version of the IcebergSink which supports: - * - *
      - *
    • Writing to any number of tables (No more 1:1 sink/topic relationship). - *
    • Creating and updating tables based on the user-supplied routing. - *
    • Updating the schema and partition spec of tables based on the user-supplied specification. - *
    - */ -@Experimental -public class DynamicIcebergSink - implements Sink, - SupportsPreWriteTopology, - SupportsCommitter, - SupportsPreCommitTopology, - SupportsPostCommitTopology { - - private final CatalogLoader catalogLoader; - private final Map snapshotProperties; - private final String uidPrefix; - private final String sinkId; - private final Map writeProperties; - private final transient FlinkWriteConf flinkWriteConf; - private final FileFormat dataFileFormat; - private final long targetDataFileSize; - private final boolean overwriteMode; - private final int workerPoolSize; - private final int cacheMaximumSize; - - DynamicIcebergSink( - CatalogLoader catalogLoader, - Map snapshotProperties, - String uidPrefix, - Map writeProperties, - FlinkWriteConf flinkWriteConf, - int cacheMaximumSize) { - this.catalogLoader = catalogLoader; - this.snapshotProperties = snapshotProperties; - this.uidPrefix = uidPrefix; - this.writeProperties = writeProperties; - this.flinkWriteConf = flinkWriteConf; - this.dataFileFormat = flinkWriteConf.dataFileFormat(); - this.targetDataFileSize = flinkWriteConf.targetDataFileSize(); - this.overwriteMode = flinkWriteConf.overwriteMode(); - this.workerPoolSize = flinkWriteConf.workerPoolSize(); - this.cacheMaximumSize = cacheMaximumSize; - // We generate a random UUID every time when a sink is created. - // This is used to separate files generated by different sinks writing the same table. - // Also used to generate the aggregator operator name - this.sinkId = UUID.randomUUID().toString(); - } - - @Override - public SinkWriter createWriter(InitContext context) throws IOException { - return new DynamicWriter( - catalogLoader.loadCatalog(), - dataFileFormat, - targetDataFileSize, - writeProperties, - cacheMaximumSize, - new DynamicWriterMetrics(context.metricGroup()), - context.getTaskInfo().getIndexOfThisSubtask(), - context.getTaskInfo().getAttemptNumber()); - } - - @Override - public Committer createCommitter(CommitterInitContext context) { - DynamicCommitterMetrics metrics = new DynamicCommitterMetrics(context.metricGroup()); - return new DynamicCommitter( - catalogLoader.loadCatalog(), - snapshotProperties, - overwriteMode, - workerPoolSize, - sinkId, - metrics); - } - - @Override - public SimpleVersionedSerializer getCommittableSerializer() { - return new DynamicCommittableSerializer(); - } - - @Override - public void addPostCommitTopology( - DataStream> committables) {} - - @Override - public DataStream addPreWriteTopology( - DataStream inputDataStream) { - return distributeDataStream(inputDataStream); - } - - @Override - public DataStream> addPreCommitTopology( - DataStream> writeResults) { - TypeInformation> typeInformation = - CommittableMessageTypeInfo.of(this::getCommittableSerializer); - - return writeResults - .keyBy( - committable -> { - if (committable instanceof CommittableSummary) { - return "__summary"; - } else { - CommittableWithLineage result = - (CommittableWithLineage) committable; - return result.getCommittable().key().tableName(); - } - }) - .transform( - prefixIfNotNull(uidPrefix, sinkId + " Pre Commit"), - typeInformation, - new DynamicWriteResultAggregator(catalogLoader)) - .uid(prefixIfNotNull(uidPrefix, sinkId + "-pre-commit-topology")); - } - - @Override - public SimpleVersionedSerializer getWriteResultSerializer() { - return new DynamicWriteResultSerializer(); - } - - public static class Builder { - private DataStream input; - private DynamicRecordGenerator generator; - private CatalogLoader catalogLoader; - private String 
uidPrefix = null; - private final Map writeOptions = Maps.newHashMap(); - private final Map snapshotSummary = Maps.newHashMap(); - private ReadableConfig readableConfig = new Configuration(); - private boolean immediateUpdate = false; - private int cacheMaximumSize = 100; - private long cacheRefreshMs = 1_000; - private int inputSchemasPerTableCacheMaximumSize = 10; - - Builder() {} - - public Builder forInput(DataStream inputStream) { - this.input = inputStream; - return this; - } - - public Builder generator(DynamicRecordGenerator inputGenerator) { - this.generator = inputGenerator; - return this; - } - - /** - * The catalog loader is used for loading tables in {@link DynamicCommitter} lazily, we need - * this loader because {@link Table} is not serializable and could not just use the loaded table - * from Builder#table in the remote task manager. - * - * @param newCatalogLoader to load iceberg table inside tasks. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder catalogLoader(CatalogLoader newCatalogLoader) { - this.catalogLoader = newCatalogLoader; - return this; - } - - /** - * Set the write properties for IcebergSink. View the supported properties in {@link - * FlinkWriteOptions} - */ - public Builder set(String property, String value) { - writeOptions.put(property, value); - return this; - } - - /** - * Set the write properties for IcebergSink. View the supported properties in {@link - * FlinkWriteOptions} - */ - public Builder setAll(Map properties) { - writeOptions.putAll(properties); - return this; - } - - public Builder overwrite(boolean newOverwrite) { - writeOptions.put(FlinkWriteOptions.OVERWRITE_MODE.key(), Boolean.toString(newOverwrite)); - return this; - } - - public Builder flinkConf(ReadableConfig config) { - this.readableConfig = config; - return this; - } - - /** - * Configuring the write parallel number for iceberg stream writer. - * - * @param newWriteParallelism the number of parallel iceberg stream writer. - * @return {@link DynamicIcebergSink.Builder} to connect the iceberg table. - */ - public Builder writeParallelism(int newWriteParallelism) { - writeOptions.put( - FlinkWriteOptions.WRITE_PARALLELISM.key(), Integer.toString(newWriteParallelism)); - return this; - } - - /** - * Set the uid prefix for IcebergSink operators. Note that IcebergSink internally consists of - * multiple operators (like writer, committer, aggregator) Actual operator uid will be appended - * with a suffix like "uidPrefix-writer". - * - *

    If provided, this prefix is also applied to operator names. - * - *

    Flink auto-generates operator uids if not set explicitly. It is a recommended - * best practice to set uids for all operators before deploying to production. Flink has an - * option, {@code pipeline.auto-generate-uid=false}, to disable auto-generation and force - * explicit setting of all operator uids. - *

    Be careful with setting this for an existing job, because now we are changing the operator - * uid from an auto-generated one to this new value. When deploying the change with a - * checkpoint, Flink won't be able to restore the previous IcebergSink operator state (more - * specifically the committer operator state). You need to use {@code --allowNonRestoredState} - * to ignore the previous sink state. During restore IcebergSink state is used to check if last - * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss - * if the Iceberg commit failed in the last completed checkpoint. - * - * @param newPrefix prefix for Flink sink operator uid and name - * @return {@link Builder} to connect the iceberg table. - */ - public Builder uidPrefix(String newPrefix) { - this.uidPrefix = newPrefix; - return this; - } - - public Builder snapshotProperties(Map properties) { - snapshotSummary.putAll(properties); - return this; - } - - public Builder setSnapshotProperty(String property, String value) { - snapshotSummary.put(property, value); - return this; - } - - public Builder toBranch(String branch) { - writeOptions.put(FlinkWriteOptions.BRANCH.key(), branch); - return this; - } - - public Builder immediateTableUpdate(boolean newImmediateUpdate) { - this.immediateUpdate = newImmediateUpdate; - return this; - } - - /** Maximum size of the caches used in Dynamic Sink for table data and serializers. */ - public Builder cacheMaxSize(int maxSize) { - this.cacheMaximumSize = maxSize; - return this; - } - - /** Maximum interval for cache items renewals. */ - public Builder cacheRefreshMs(long refreshMs) { - this.cacheRefreshMs = refreshMs; - return this; - } - - /** - * Maximum input {@link org.apache.iceberg.Schema} objects to cache per each Iceberg table. The - * cache improves Dynamic Sink performance by storing {@link org.apache.iceberg.Schema} - * comparison results. - */ - public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCacheMaxSize) { - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaxSize; - return this; - } - - private String operatorName(String suffix) { - return uidPrefix != null ? uidPrefix + "-" + suffix : suffix; - } - - private DynamicIcebergSink build() { - - Preconditions.checkArgument( - generator != null, "Please use withGenerator() to convert the input DataStream."); - Preconditions.checkNotNull(catalogLoader, "Catalog loader shouldn't be null"); - - FlinkWriteConf flinkWriteConf = new FlinkWriteConf(writeOptions, readableConfig); - Map writeProperties = - SinkUtil.writeProperties(flinkWriteConf.dataFileFormat(), flinkWriteConf, null); - uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); - - return instantiateSink(writeProperties, flinkWriteConf); - } - - @VisibleForTesting - DynamicIcebergSink instantiateSink( - Map writeProperties, FlinkWriteConf flinkWriteConf) { - return new DynamicIcebergSink( - catalogLoader, - snapshotSummary, - uidPrefix, - writeProperties, - flinkWriteConf, - cacheMaximumSize); - } - - /** - * Append the iceberg sink operators to write records to iceberg table. - * - * @return {@link DataStreamSink} for sink. 
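For illustration, the builder above can be chained roughly as follows; {@code events} (a DataStream of application records), {@code recordGenerator} and {@code catalogLoader} are assumed to be created by the application:

    DynamicIcebergSink.forInput(events)
        .generator(recordGenerator)       // converts application records into DynamicRecord
        .catalogLoader(catalogLoader)     // loads the Iceberg catalog on the task managers
        .writeParallelism(4)
        .uidPrefix("dynamic-iceberg-sink")
        .immediateTableUpdate(true)
        .append();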
- */ - public DataStreamSink append() { - DynamicRecordInternalType type = - new DynamicRecordInternalType(catalogLoader, false, cacheMaximumSize); - DynamicIcebergSink sink = build(); - SingleOutputStreamOperator converted = - input - .process( - new DynamicRecordProcessor<>( - generator, - catalogLoader, - immediateUpdate, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize)) - .uid(prefixIfNotNull(uidPrefix, "-generator")) - .name(operatorName("generator")) - .returns(type); - - DataStreamSink rowDataDataStreamSink = - converted - .getSideOutput( - new OutputTag<>( - DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, - new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize))) - .keyBy((KeySelector) DynamicRecordInternal::tableName) - .map( - new DynamicTableUpdateOperator( - catalogLoader, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize)) - .uid(prefixIfNotNull(uidPrefix, "-updater")) - .name(operatorName("Updater")) - .returns(type) - .union(converted) - .sinkTo(sink) - .uid(prefixIfNotNull(uidPrefix, "-sink")); - if (sink.flinkWriteConf.writeParallelism() != null) { - rowDataDataStreamSink.setParallelism(sink.flinkWriteConf.writeParallelism()); - } - - return rowDataDataStreamSink; - } - } - - DataStream distributeDataStream(DataStream input) { - return input.keyBy(DynamicRecordInternal::writerKey); - } - - private static String prefixIfNotNull(String uidPrefix, String suffix) { - return uidPrefix != null ? uidPrefix + "-" + suffix : suffix; - } - - /** - * Initialize a {@link IcebergSink.Builder} to export the data from input data stream with {@link - * RowData}s into iceberg table. - * - * @param input the source input data stream with {@link RowData}s. - * @return {@link IcebergSink.Builder} to connect the iceberg table. - */ - public static Builder forInput(DataStream input) { - return new Builder().forInput(input); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java deleted file mode 100644 index 600a4d8b950c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.util.Set; -import javax.annotation.Nullable; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.TableIdentifier; - -/** A DynamicRecord contains RowData alongside with the Iceberg table metadata. 
*/ -public class DynamicRecord { - - private TableIdentifier tableIdentifier; - private String branch; - private Schema schema; - private RowData rowData; - private PartitionSpec partitionSpec; - private DistributionMode distributionMode; - private int writeParallelism; - private boolean upsertMode; - @Nullable private Set equalityFields; - - public DynamicRecord( - TableIdentifier tableIdentifier, - String branch, - Schema schema, - RowData rowData, - PartitionSpec partitionSpec, - DistributionMode distributionMode, - int writeParallelism) { - this.tableIdentifier = tableIdentifier; - this.branch = branch; - this.schema = schema; - this.partitionSpec = partitionSpec; - this.rowData = rowData; - this.distributionMode = distributionMode; - this.writeParallelism = writeParallelism; - } - - public TableIdentifier tableIdentifier() { - return tableIdentifier; - } - - public void setTableIdentifier(TableIdentifier tableIdentifier) { - this.tableIdentifier = tableIdentifier; - } - - public String branch() { - return branch; - } - - public void setBranch(String branch) { - this.branch = branch; - } - - public Schema schema() { - return schema; - } - - public void setSchema(Schema schema) { - this.schema = schema; - } - - public PartitionSpec spec() { - return partitionSpec; - } - - public void setPartitionSpec(PartitionSpec partitionSpec) { - this.partitionSpec = partitionSpec; - } - - public RowData rowData() { - return rowData; - } - - public void setRowData(RowData rowData) { - this.rowData = rowData; - } - - public DistributionMode distributionMode() { - return distributionMode; - } - - public void setDistributionMode(DistributionMode distributionMode) { - this.distributionMode = distributionMode; - } - - public int writeParallelism() { - return writeParallelism; - } - - public void writeParallelism(int parallelism) { - this.writeParallelism = parallelism; - } - - public boolean upsertMode() { - return upsertMode; - } - - public void setUpsertMode(boolean upsertMode) { - this.upsertMode = upsertMode; - } - - public Set equalityFields() { - return equalityFields; - } - - public void setEqualityFields(Set equalityFields) { - this.equalityFields = equalityFields; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java deleted file mode 100644 index 23319b37d1ba..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordGenerator.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.io.Serializable; -import org.apache.flink.api.common.functions.OpenContext; -import org.apache.flink.util.Collector; - -/** A generator to yield {@link DynamicRecord} from the provided input. */ -public interface DynamicRecordGenerator extends Serializable { - default void open(OpenContext openContext) throws Exception {} - - /** - * Takes the user-defined input and yields zero, one, or multiple {@link DynamicRecord}s using the - * {@link Collector}. - */ - void generate(T inputRecord, Collector out) throws Exception; -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java deleted file mode 100644 index fe1f4cdac9b7..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternal.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
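The DynamicRecordGenerator interface removed just above is the user-facing hook of the dynamic sink: it turns each input element into one or more DynamicRecords that name their target table. A minimal sketch of an implementation follows (not part of the patch); the table name, branch, schema, and spec are placeholders.

import org.apache.flink.table.data.RowData;
import org.apache.flink.util.Collector;
import org.apache.iceberg.DistributionMode;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.types.Types;

class SingleTableGenerator implements DynamicRecordGenerator<RowData> {
  // Placeholder schema; in practice this would reflect the incoming RowData layout.
  private static final Schema SCHEMA =
      new Schema(Types.NestedField.required(1, "id", Types.LongType.get()));

  @Override
  public void generate(RowData row, Collector<DynamicRecord> out) {
    out.collect(
        new DynamicRecord(
            TableIdentifier.of("db", "events"),  // target table, chosen per record
            "main",                              // branch
            SCHEMA,                              // schema the RowData conforms to
            row,
            PartitionSpec.unpartitioned(),
            DistributionMode.NONE,
            1));                                 // write parallelism
  }
}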
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.util.Objects; -import java.util.Set; -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; - -@Internal -class DynamicRecordInternal { - - private String tableName; - private String branch; - private Schema schema; - private PartitionSpec spec; - private int writerKey; - private RowData rowData; - private boolean upsertMode; - private Set equalityFieldIds; - - // Required for serialization instantiation - DynamicRecordInternal() {} - - DynamicRecordInternal( - String tableName, - String branch, - Schema schema, - RowData rowData, - PartitionSpec spec, - int writerKey, - boolean upsertMode, - Set equalityFieldsIds) { - this.tableName = tableName; - this.branch = branch; - this.schema = schema; - this.spec = spec; - this.writerKey = writerKey; - this.rowData = rowData; - this.upsertMode = upsertMode; - this.equalityFieldIds = equalityFieldsIds; - } - - public String tableName() { - return tableName; - } - - public void setTableName(String tableName) { - this.tableName = tableName; - } - - public String branch() { - return branch; - } - - public void setBranch(String branch) { - this.branch = branch; - } - - public Schema schema() { - return schema; - } - - public void setSchema(Schema schema) { - this.schema = schema; - } - - public RowData rowData() { - return rowData; - } - - public void setRowData(RowData rowData) { - this.rowData = rowData; - } - - public PartitionSpec spec() { - return spec; - } - - public void setSpec(PartitionSpec spec) { - this.spec = spec; - } - - public int writerKey() { - return writerKey; - } - - public void setWriterKey(int writerKey) { - this.writerKey = writerKey; - } - - public boolean upsertMode() { - return upsertMode; - } - - public void setUpsertMode(boolean upsertMode) { - this.upsertMode = upsertMode; - } - - public Set equalityFields() { - return equalityFieldIds; - } - - public void setEqualityFieldIds(Set equalityFieldIds) { - this.equalityFieldIds = equalityFieldIds; - } - - @Override - public int hashCode() { - return Objects.hash( - tableName, branch, schema, spec, writerKey, rowData, upsertMode, equalityFieldIds); - } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - - if (other == null || getClass() != other.getClass()) { - return false; - } - - DynamicRecordInternal that = (DynamicRecordInternal) other; - boolean tableFieldsMatch = - Objects.equals(tableName, that.tableName) - && Objects.equals(branch, that.branch) - && schema.schemaId() == that.schema.schemaId() - && Objects.equals(spec, that.spec) - && writerKey == that.writerKey - && upsertMode == that.upsertMode - && Objects.equals(equalityFieldIds, that.equalityFieldIds); - if (!tableFieldsMatch) { - return false; - } - - if (rowData.getClass().equals(that.rowData.getClass())) { - return Objects.equals(rowData, that.rowData); - } else { - RowDataSerializer rowDataSerializer = new RowDataSerializer(FlinkSchemaUtil.convert(schema)); - return rowDataSerializer - .toBinaryRow(rowData) - .equals(rowDataSerializer.toBinaryRow(that.rowData)); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java 
b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java deleted file mode 100644 index b139d9a898bf..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializer.java +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.io.IOException; -import java.util.Collections; -import java.util.Set; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.api.java.tuple.Tuple3; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.hadoop.util.Sets; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.PartitionSpecParser; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; - -@Internal -class DynamicRecordInternalSerializer extends TypeSerializer { - - private static final long serialVersionUID = 1L; - - private final TableSerializerCache serializerCache; - private final boolean writeSchemaAndSpec; - - DynamicRecordInternalSerializer( - TableSerializerCache serializerCache, boolean writeSchemaAndSpec) { - this.serializerCache = serializerCache; - this.writeSchemaAndSpec = writeSchemaAndSpec; - } - - @Override - public TypeSerializer duplicate() { - return new DynamicRecordInternalSerializer( - new TableSerializerCache(serializerCache.catalogLoader(), serializerCache.maximumSize()), - writeSchemaAndSpec); - } - - @Override - public DynamicRecordInternal createInstance() { - return new DynamicRecordInternal(); - } - - @Override - public void serialize(DynamicRecordInternal toSerialize, DataOutputView dataOutputView) - throws IOException { - dataOutputView.writeUTF(toSerialize.tableName()); - dataOutputView.writeUTF(toSerialize.branch()); - if (writeSchemaAndSpec) { - dataOutputView.writeUTF(SchemaParser.toJson(toSerialize.schema())); - dataOutputView.writeUTF(PartitionSpecParser.toJson(toSerialize.spec())); - } else { - dataOutputView.writeInt(toSerialize.schema().schemaId()); - dataOutputView.writeInt(toSerialize.spec().specId()); - } - - dataOutputView.writeInt(toSerialize.writerKey()); - final RowDataSerializer rowDataSerializer; - if (writeSchemaAndSpec) { - rowDataSerializer = - serializerCache.serializer( - toSerialize.tableName(), toSerialize.schema(), toSerialize.spec()); - } else { - // Check that 
the schema id can be resolved. Not strictly necessary for serialization. - Tuple3 serializer = - serializerCache.serializerWithSchemaAndSpec( - toSerialize.tableName(), - toSerialize.schema().schemaId(), - toSerialize.spec().specId()); - rowDataSerializer = serializer.f0; - } - - rowDataSerializer.serialize(toSerialize.rowData(), dataOutputView); - dataOutputView.writeBoolean(toSerialize.upsertMode()); - dataOutputView.writeInt(toSerialize.equalityFields().size()); - for (Integer equalityField : toSerialize.equalityFields()) { - dataOutputView.writeInt(equalityField); - } - } - - @Override - public DynamicRecordInternal deserialize(DataInputView dataInputView) throws IOException { - String tableName = dataInputView.readUTF(); - String branch = dataInputView.readUTF(); - - final Schema schema; - final PartitionSpec spec; - final RowDataSerializer rowDataSerializer; - if (writeSchemaAndSpec) { - schema = SchemaParser.fromJson(dataInputView.readUTF()); - spec = PartitionSpecParser.fromJson(schema, dataInputView.readUTF()); - rowDataSerializer = serializerCache.serializer(tableName, schema, spec); - } else { - Integer schemaId = dataInputView.readInt(); - Integer specId = dataInputView.readInt(); - Tuple3 serializerWithSchemaAndSpec = - serializerCache.serializerWithSchemaAndSpec(tableName, schemaId, specId); - schema = serializerWithSchemaAndSpec.f1; - spec = serializerWithSchemaAndSpec.f2; - rowDataSerializer = serializerWithSchemaAndSpec.f0; - } - - int writerKey = dataInputView.readInt(); - RowData rowData = rowDataSerializer.deserialize(dataInputView); - boolean upsertMode = dataInputView.readBoolean(); - int numEqualityFields = dataInputView.readInt(); - final Set equalityFieldIds; - if (numEqualityFields > 0) { - equalityFieldIds = Sets.newHashSetWithExpectedSize(numEqualityFields); - } else { - equalityFieldIds = Collections.emptySet(); - } - - for (int i = 0; i < numEqualityFields; i++) { - equalityFieldIds.add(dataInputView.readInt()); - } - - return new DynamicRecordInternal( - tableName, branch, schema, rowData, spec, writerKey, upsertMode, equalityFieldIds); - } - - @Override - public DynamicRecordInternal deserialize(DynamicRecordInternal reuse, DataInputView dataInputView) - throws IOException { - String tableName = dataInputView.readUTF(); - reuse.setTableName(tableName); - String branch = dataInputView.readUTF(); - reuse.setBranch(branch); - - final Schema schema; - final PartitionSpec spec; - final RowDataSerializer rowDataSerializer; - if (writeSchemaAndSpec) { - schema = SchemaParser.fromJson(dataInputView.readUTF()); - spec = PartitionSpecParser.fromJson(schema, dataInputView.readUTF()); - reuse.setSchema(schema); - reuse.setSpec(spec); - rowDataSerializer = serializerCache.serializer(tableName, schema, spec); - } else { - Integer schemaId = dataInputView.readInt(); - Integer specId = dataInputView.readInt(); - Tuple3 serializerWithSchemaAndSpec = - serializerCache.serializerWithSchemaAndSpec(tableName, schemaId, specId); - schema = serializerWithSchemaAndSpec.f1; - spec = serializerWithSchemaAndSpec.f2; - rowDataSerializer = serializerWithSchemaAndSpec.f0; - } - - int writerKey = dataInputView.readInt(); - reuse.setWriterKey(writerKey); - RowData rowData = rowDataSerializer.deserialize(dataInputView); - boolean upsertMode = dataInputView.readBoolean(); - int numEqualityFields = dataInputView.readInt(); - final Set equalityFieldIds; - if (numEqualityFields > 0) { - equalityFieldIds = Sets.newHashSetWithExpectedSize(numEqualityFields); - } else { - equalityFieldIds = 
Collections.emptySet(); - } - for (int i = 0; i < numEqualityFields; i++) { - equalityFieldIds.add(dataInputView.readInt()); - } - return new DynamicRecordInternal( - tableName, branch, schema, rowData, spec, writerKey, upsertMode, equalityFieldIds); - } - - @Override - public DynamicRecordInternal copy(DynamicRecordInternal from) { - return new DynamicRecordInternal( - from.tableName(), - from.branch(), - from.schema(), - from.rowData(), - from.spec(), - from.writerKey(), - from.upsertMode(), - from.equalityFields()); - } - - @Override - public DynamicRecordInternal copy(DynamicRecordInternal from, DynamicRecordInternal reuse) { - reuse.setTableName(from.tableName()); - reuse.setBranch(from.branch()); - reuse.setSchema(from.schema()); - reuse.setSpec(from.spec()); - reuse.setWriterKey(from.writerKey()); - reuse.setRowData(from.rowData()); - reuse.setUpsertMode(from.upsertMode()); - reuse.setEqualityFieldIds(from.equalityFields()); - return reuse; - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws IOException { - serialize(deserialize(source), target); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (obj instanceof DynamicRecordInternalSerializer) { - DynamicRecordInternalSerializer other = (DynamicRecordInternalSerializer) obj; - return writeSchemaAndSpec == other.writeSchemaAndSpec; - } - return false; - } - - @Override - public int hashCode() { - return Boolean.hashCode(writeSchemaAndSpec); - } - - @Override - public boolean isImmutableType() { - return false; - } - - @Override - public int getLength() { - return -1; - } - - @Override - public TypeSerializerSnapshot snapshotConfiguration() { - return new DynamicRecordInternalTypeSerializerSnapshot(writeSchemaAndSpec); - } - - public static class DynamicRecordInternalTypeSerializerSnapshot - implements TypeSerializerSnapshot { - - private boolean writeSchemaAndSpec; - - // Zero args constructor is required to instantiate this class on restore - @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) - public DynamicRecordInternalTypeSerializerSnapshot() {} - - DynamicRecordInternalTypeSerializerSnapshot(boolean writeSchemaAndSpec) { - this.writeSchemaAndSpec = writeSchemaAndSpec; - } - - @Override - public int getCurrentVersion() { - return 0; - } - - @Override - public void writeSnapshot(DataOutputView out) throws IOException { - out.writeBoolean(writeSchemaAndSpec); - } - - @Override - public void readSnapshot(int readVersion, DataInputView in, ClassLoader userCodeClassLoader) - throws IOException { - this.writeSchemaAndSpec = in.readBoolean(); - } - - @Override - public TypeSerializerSchemaCompatibility resolveSchemaCompatibility( - TypeSerializerSnapshot oldSerializerSnapshot) { - return TypeSerializerSchemaCompatibility.compatibleAsIs(); - } - - @Override - public TypeSerializer restoreSerializer() { - // Note: We pass in a null serializer cache which would create issues if we tried to use this - // restored serializer, but since we are using {@code - // TypeSerializerSchemaCompatibility.compatibleAsIs()} above, this serializer will never be - // used. A new one will be created via {@code DynamicRecordInternalType}. 
- return new DynamicRecordInternalSerializer(null, writeSchemaAndSpec); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java deleted file mode 100644 index 6be081aadf77..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalType.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.serialization.SerializerConfig; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.iceberg.flink.CatalogLoader; - -@Internal -class DynamicRecordInternalType extends TypeInformation { - - private final CatalogLoader catalogLoader; - private final boolean writeSchemaAndSpec; - private final int cacheSize; - - DynamicRecordInternalType( - CatalogLoader catalogLoader, boolean writeSchemaAndSpec, int cacheSize) { - this.catalogLoader = catalogLoader; - this.writeSchemaAndSpec = writeSchemaAndSpec; - this.cacheSize = cacheSize; - } - - @Override - public boolean isBasicType() { - return false; - } - - @Override - public boolean isTupleType() { - return false; - } - - @Override - public int getArity() { - return 0; - } - - @Override - public int getTotalFields() { - return 1; - } - - @Override - public Class getTypeClass() { - return DynamicRecordInternal.class; - } - - @Override - public boolean isKeyType() { - return false; - } - - @Override - public TypeSerializer createSerializer(SerializerConfig serializerConfig) { - return new DynamicRecordInternalSerializer( - new TableSerializerCache(catalogLoader, cacheSize), writeSchemaAndSpec); - } - - @Override - public TypeSerializer createSerializer(ExecutionConfig executionConfig) { - return new DynamicRecordInternalSerializer( - new TableSerializerCache(catalogLoader, cacheSize), writeSchemaAndSpec); - } - - @Override - public String toString() { - return getClass().getName(); - } - - @Override - public boolean equals(Object o) { - return canEqual(o); - } - - @Override - public int hashCode() { - return getClass().getName().hashCode(); - } - - @Override - public boolean canEqual(Object o) { - return o instanceof DynamicRecordInternalType; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java deleted file mode 100644 index 
166217a0140e..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.common.functions.OpenContext; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.flink.streaming.api.functions.ProcessFunction; -import org.apache.flink.table.data.RowData; -import org.apache.flink.util.Collector; -import org.apache.flink.util.OutputTag; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.flink.CatalogLoader; - -@Internal -class DynamicRecordProcessor extends ProcessFunction - implements Collector { - @VisibleForTesting - static final String DYNAMIC_TABLE_UPDATE_STREAM = "dynamic-table-update-stream"; - - private final DynamicRecordGenerator generator; - private final CatalogLoader catalogLoader; - private final boolean immediateUpdate; - private final int cacheMaximumSize; - private final long cacheRefreshMs; - private final int inputSchemasPerTableCacheMaximumSize; - - private transient TableMetadataCache tableCache; - private transient HashKeyGenerator hashKeyGenerator; - private transient TableUpdater updater; - private transient OutputTag updateStream; - private transient Collector collector; - private transient Context context; - - DynamicRecordProcessor( - DynamicRecordGenerator generator, - CatalogLoader catalogLoader, - boolean immediateUpdate, - int cacheMaximumSize, - long cacheRefreshMs, - int inputSchemasPerTableCacheMaximumSize) { - this.generator = generator; - this.catalogLoader = catalogLoader; - this.immediateUpdate = immediateUpdate; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; - } - - @Override - public void open(OpenContext openContext) throws Exception { - super.open(openContext); - Catalog catalog = catalogLoader.loadCatalog(); - this.tableCache = - new TableMetadataCache( - catalog, cacheMaximumSize, cacheRefreshMs, inputSchemasPerTableCacheMaximumSize); - this.hashKeyGenerator = - new HashKeyGenerator( - cacheMaximumSize, getRuntimeContext().getTaskInfo().getMaxNumberOfParallelSubtasks()); - if (immediateUpdate) { - updater = new TableUpdater(tableCache, catalog); - } else { - updateStream = - new OutputTag<>( - DYNAMIC_TABLE_UPDATE_STREAM, - new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize)) {}; - } - - generator.open(openContext); - } - - @Override - public 
void processElement(T element, Context ctx, Collector out) - throws Exception { - this.context = ctx; - this.collector = out; - generator.generate(element, this); - } - - @Override - public void collect(DynamicRecord data) { - boolean exists = tableCache.exists(data.tableIdentifier()).f0; - String foundBranch = exists ? tableCache.branch(data.tableIdentifier(), data.branch()) : null; - - TableMetadataCache.ResolvedSchemaInfo foundSchema = - exists - ? tableCache.schema(data.tableIdentifier(), data.schema()) - : TableMetadataCache.NOT_FOUND; - - PartitionSpec foundSpec = exists ? tableCache.spec(data.tableIdentifier(), data.spec()) : null; - - if (!exists - || foundBranch == null - || foundSpec == null - || foundSchema.compareResult() == CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED) { - if (immediateUpdate) { - Tuple2 newData = - updater.update(data.tableIdentifier(), data.branch(), data.schema(), data.spec()); - emit( - collector, - data, - newData.f0.resolvedTableSchema(), - newData.f0.recordConverter(), - newData.f1); - } else { - int writerKey = - hashKeyGenerator.generateKey( - data, - foundSchema.resolvedTableSchema() != null - ? foundSchema.resolvedTableSchema() - : data.schema(), - foundSpec != null ? foundSpec : data.spec(), - data.rowData()); - context.output( - updateStream, - new DynamicRecordInternal( - data.tableIdentifier().toString(), - data.branch(), - data.schema(), - data.rowData(), - data.spec(), - writerKey, - data.upsertMode(), - DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), data.schema()))); - } - } else { - emit( - collector, - data, - foundSchema.resolvedTableSchema(), - foundSchema.recordConverter(), - foundSpec); - } - } - - private void emit( - Collector out, - DynamicRecord data, - Schema schema, - DataConverter recordConverter, - PartitionSpec spec) { - RowData rowData = (RowData) recordConverter.convert(data.rowData()); - int writerKey = hashKeyGenerator.generateKey(data, schema, spec, rowData); - String tableName = data.tableIdentifier().toString(); - out.collect( - new DynamicRecordInternal( - tableName, - data.branch(), - schema, - rowData, - spec, - writerKey, - data.upsertMode(), - DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema))); - } - - @Override - public void close() { - try { - super.close(); - } catch (Exception e) { - throw new RuntimeException(e); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java deleted file mode 100644 index 6ea6dcab867a..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicSinkUtil.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.util.Collections; -import java.util.Set; -import org.apache.hadoop.util.Sets; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.types.Types; - -class DynamicSinkUtil { - - private DynamicSinkUtil() {} - - static Set getEqualityFieldIds(Set equalityFields, Schema schema) { - if (equalityFields == null || equalityFields.isEmpty()) { - if (!schema.identifierFieldIds().isEmpty()) { - return schema.identifierFieldIds(); - } else { - return Collections.emptySet(); - } - } - - Set equalityFieldIds = Sets.newHashSetWithExpectedSize(equalityFields.size()); - for (String equalityField : equalityFields) { - Types.NestedField field = schema.findField(equalityField); - Preconditions.checkNotNull( - field, "Equality field %s does not exist in schema", equalityField); - equalityFieldIds.add(field.fieldId()); - } - - return equalityFieldIds; - } - - static int safeAbs(int input) { - if (input >= 0) { - return input; - } - - if (input == Integer.MIN_VALUE) { - // -Integer.MIN_VALUE would be Integer.MIN_VALUE due to integer overflow. Map to - // Integer.MAX_VALUE instead! - return Integer.MAX_VALUE; - } - - return -input; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java deleted file mode 100644 index 6057d773c3f0..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.functions.OpenContext; -import org.apache.flink.api.common.functions.RichMapFunction; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.CatalogLoader; - -/** - * An optional operator to perform table updates for tables (e.g. schema update) in a non-concurrent - * way. Records must be keyed / routed to this operator by table name to ensure non-concurrent - * updates. The operator itself forwards the record after updating schema / spec of the table. The - * update is also reflected in the record. 
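The keyed routing this javadoc requires is set up by append() earlier in this diff; a condensed sketch of that wiring follows (not part of the patch, the free variables mirror the names used in append(), and the referenced classes are package-private, so this is illustrative only).

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.util.OutputTag;
import org.apache.iceberg.flink.CatalogLoader;

class UpdateRouting {
  // Condensed from DynamicIcebergSink.Builder#append(): update records go to a side output,
  // are keyed by table name so each table is updated by exactly one subtask, then unioned
  // back with the main stream and handed to the sink.
  static void route(
      SingleOutputStreamOperator<DynamicRecordInternal> converted,  // DynamicRecordProcessor output
      OutputTag<DynamicRecordInternal> updateStreamTag,             // DYNAMIC_TABLE_UPDATE_STREAM tag
      DynamicRecordInternalType type,
      DynamicIcebergSink sink,
      CatalogLoader catalogLoader,
      int cacheMaximumSize,
      long cacheRefreshMs,
      int inputSchemasPerTableCacheMaximumSize) {
    DataStream<DynamicRecordInternal> updated =
        converted
            .getSideOutput(updateStreamTag)
            .keyBy(DynamicRecordInternal::tableName)                // per-table, non-concurrent updates
            .map(new DynamicTableUpdateOperator(
                catalogLoader,
                cacheMaximumSize,
                cacheRefreshMs,
                inputSchemasPerTableCacheMaximumSize))
            .returns(type);

    updated.union(converted).sinkTo(sink);
  }
}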
- */ -@Internal -class DynamicTableUpdateOperator - extends RichMapFunction { - private final CatalogLoader catalogLoader; - private final int cacheMaximumSize; - private final long cacheRefreshMs; - private final int inputSchemasPerTableCacheMaximumSize; - - private transient TableUpdater updater; - - DynamicTableUpdateOperator( - CatalogLoader catalogLoader, - int cacheMaximumSize, - long cacheRefreshMs, - int inputSchemasPerTableCacheMaximumSize) { - this.catalogLoader = catalogLoader; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; - } - - @Override - public void open(OpenContext openContext) throws Exception { - super.open(openContext); - Catalog catalog = catalogLoader.loadCatalog(); - this.updater = - new TableUpdater( - new TableMetadataCache( - catalog, cacheMaximumSize, cacheRefreshMs, inputSchemasPerTableCacheMaximumSize), - catalog); - } - - @Override - public DynamicRecordInternal map(DynamicRecordInternal data) throws Exception { - Tuple2 newData = - updater.update( - TableIdentifier.parse(data.tableName()), data.branch(), data.schema(), data.spec()); - TableMetadataCache.ResolvedSchemaInfo compareInfo = newData.f0; - - data.setSchema(compareInfo.resolvedTableSchema()); - data.setSpec(newData.f1); - - RowData newRowData = (RowData) newData.f0.recordConverter().convert(data.rowData()); - data.setRowData(newRowData); - - return data; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java deleted file mode 100644 index 85806f932ad5..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResult.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import org.apache.iceberg.io.WriteResult; - -class DynamicWriteResult { - - private final WriteTarget key; - private final WriteResult writeResult; - - DynamicWriteResult(WriteTarget key, WriteResult writeResult) { - this.key = key; - this.writeResult = writeResult; - } - - WriteTarget key() { - return key; - } - - WriteResult writeResult() { - return writeResult; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java deleted file mode 100644 index 58ba183dfcd4..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import com.github.benmanes.caffeine.cache.Cache; -import com.github.benmanes.caffeine.cache.Caffeine; -import java.io.IOException; -import java.time.Duration; -import java.util.Collection; -import java.util.Map; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; -import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; -import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.sink.DeltaManifests; -import org.apache.iceberg.flink.sink.DeltaManifestsSerializer; -import org.apache.iceberg.flink.sink.FlinkManifestUtil; -import org.apache.iceberg.flink.sink.ManifestOutputFileFactory; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Operator which aggregates the individual {@link WriteResult} objects to a single {@link - * DynamicCommittable} per checkpoint (storing the serialized {@link DeltaManifests}, jobId, - * operatorId, checkpointId) - */ -class DynamicWriteResultAggregator - extends AbstractStreamOperator> - implements OneInputStreamOperator< - 
CommittableMessage, CommittableMessage> { - private static final Logger LOG = LoggerFactory.getLogger(DynamicWriteResultAggregator.class); - private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; - private static final Duration CACHE_EXPIRATION_DURATION = Duration.ofMinutes(1); - - private final CatalogLoader catalogLoader; - private transient Map> results; - private transient Cache> specs; - private transient Cache outputFileFactories; - private transient String flinkJobId; - private transient String operatorId; - private transient int subTaskId; - private transient int attemptId; - private transient Catalog catalog; - - DynamicWriteResultAggregator(CatalogLoader catalogLoader) { - this.catalogLoader = catalogLoader; - } - - @Override - public void open() throws Exception { - this.flinkJobId = getContainingTask().getEnvironment().getJobID().toString(); - this.operatorId = getOperatorID().toString(); - this.subTaskId = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); - this.attemptId = getRuntimeContext().getTaskInfo().getAttemptNumber(); - this.results = Maps.newHashMap(); - this.specs = - Caffeine.newBuilder().expireAfterWrite(CACHE_EXPIRATION_DURATION).softValues().build(); - this.outputFileFactories = - Caffeine.newBuilder().expireAfterWrite(CACHE_EXPIRATION_DURATION).softValues().build(); - this.catalog = catalogLoader.loadCatalog(); - } - - @Override - public void finish() throws IOException { - prepareSnapshotPreBarrier(Long.MAX_VALUE); - } - - @Override - public void prepareSnapshotPreBarrier(long checkpointId) throws IOException { - Collection> committables = - Sets.newHashSetWithExpectedSize(results.size()); - int count = 0; - for (Map.Entry> entries : results.entrySet()) { - committables.add( - new CommittableWithLineage<>( - new DynamicCommittable( - entries.getKey(), - writeToManifest(entries.getKey(), entries.getValue(), checkpointId), - getContainingTask().getEnvironment().getJobID().toString(), - getRuntimeContext().getOperatorUniqueID(), - checkpointId), - checkpointId, - count)); - ++count; - } - - output.collect( - new StreamRecord<>( - new CommittableSummary<>(subTaskId, count, checkpointId, count, count, 0))); - committables.forEach( - c -> - output.collect( - new StreamRecord<>( - new CommittableWithLineage<>(c.getCommittable(), checkpointId, subTaskId)))); - LOG.info("Emitted {} commit message to downstream committer operator", count); - results.clear(); - } - - /** - * Write all the completed data files to a newly created manifest file and return the manifest's - * avro serialized bytes. 
- */ - @VisibleForTesting - byte[] writeToManifest( - WriteTarget key, Collection writeResults, long checkpointId) - throws IOException { - if (writeResults.isEmpty()) { - return EMPTY_MANIFEST_DATA; - } - - WriteResult.Builder builder = WriteResult.builder(); - writeResults.forEach(w -> builder.add(w.writeResult())); - WriteResult result = builder.build(); - - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - result, - () -> outputFileFactory(key.tableName()).create(checkpointId), - spec(key.tableName(), key.specId())); - - return SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, deltaManifests); - } - - @Override - public void processElement(StreamRecord> element) - throws Exception { - - if (element.isRecord() && element.getValue() instanceof CommittableWithLineage) { - DynamicWriteResult result = - ((CommittableWithLineage) element.getValue()).getCommittable(); - WriteTarget key = result.key(); - results.computeIfAbsent(key, unused -> Sets.newHashSet()).add(result); - } - } - - private ManifestOutputFileFactory outputFileFactory(String tableName) { - return outputFileFactories.get( - tableName, - unused -> { - Table table = catalog.loadTable(TableIdentifier.parse(tableName)); - specs.put(tableName, table.specs()); - return FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, subTaskId, attemptId); - }); - } - - private PartitionSpec spec(String tableName, int specId) { - Map knownSpecs = specs.getIfPresent(tableName); - if (knownSpecs != null) { - PartitionSpec spec = knownSpecs.get(specId); - if (spec != null) { - return spec; - } - } - - Table table = catalog.loadTable(TableIdentifier.parse(tableName)); - return table.specs().get(specId); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java deleted file mode 100644 index cf5f423fd7ff..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultSerializer.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputViewStreamWrapper; -import org.apache.iceberg.flink.sink.WriteResultSerializer; -import org.apache.iceberg.io.WriteResult; - -class DynamicWriteResultSerializer implements SimpleVersionedSerializer { - - private static final int VERSION = 1; - private static final WriteResultSerializer WRITE_RESULT_SERIALIZER = new WriteResultSerializer(); - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(DynamicWriteResult writeResult) throws IOException { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - DataOutputViewStreamWrapper view = new DataOutputViewStreamWrapper(out); - writeResult.key().serializeTo(view); - byte[] result = WRITE_RESULT_SERIALIZER.serialize(writeResult.writeResult()); - view.write(result); - return out.toByteArray(); - } - - @Override - public DynamicWriteResult deserialize(int version, byte[] serialized) throws IOException { - if (version == 1) { - DataInputDeserializer view = new DataInputDeserializer(serialized); - WriteTarget key = WriteTarget.deserializeFrom(view); - byte[] resultBuf = new byte[view.available()]; - view.read(resultBuf); - WriteResult writeResult = WRITE_RESULT_SERIALIZER.deserialize(version, resultBuf); - return new DynamicWriteResult(key, writeResult); - } - - throw new IOException("Unrecognized version or corrupt state: " + version); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java deleted file mode 100644 index ae24efafa6af..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriter.java +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.TimeUnit; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.connector.sink2.CommittingSinkWriter; -import org.apache.flink.api.connector.sink2.SinkWriter; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Iceberg writer implementation for the {@link SinkWriter} interface. Used by the - * DynamicIcebergSink. Writes out the data to the final place, and emits {@link DynamicWriteResult} - * for every unique {@link WriteTarget} at checkpoint time. - */ -class DynamicWriter implements CommittingSinkWriter { - - private static final Logger LOG = LoggerFactory.getLogger(DynamicWriter.class); - - private final Map taskWriterFactories; - private final Map> writers; - private final DynamicWriterMetrics metrics; - private final int subTaskId; - private final int attemptId; - private final Catalog catalog; - private final FileFormat dataFileFormat; - private final long targetDataFileSize; - private final Map commonWriteProperties; - - DynamicWriter( - Catalog catalog, - FileFormat dataFileFormat, - long targetDataFileSize, - Map commonWriteProperties, - int cacheMaximumSize, - DynamicWriterMetrics metrics, - int subTaskId, - int attemptId) { - this.catalog = catalog; - this.dataFileFormat = dataFileFormat; - this.targetDataFileSize = targetDataFileSize; - this.commonWriteProperties = commonWriteProperties; - this.metrics = metrics; - this.subTaskId = subTaskId; - this.attemptId = attemptId; - this.taskWriterFactories = new LRUCache<>(cacheMaximumSize); - this.writers = Maps.newHashMap(); - - LOG.debug("DynamicIcebergSinkWriter created for subtask {} attemptId {}", subTaskId, attemptId); - } - - @Override - public void write(DynamicRecordInternal element, Context context) - throws IOException, InterruptedException { - writers - .computeIfAbsent( - new WriteTarget( - element.tableName(), - element.branch(), - element.schema().schemaId(), - element.spec().specId(), - element.upsertMode(), - element.equalityFields()), - writerKey -> { - RowDataTaskWriterFactory taskWriterFactory = - taskWriterFactories.computeIfAbsent( - writerKey, - factoryKey -> { - Table table = - catalog.loadTable(TableIdentifier.parse(factoryKey.tableName())); - - Map tableWriteProperties = - Maps.newHashMap(table.properties()); - tableWriteProperties.putAll(commonWriteProperties); - - Set equalityFieldIds = - getEqualityFields(table, element.equalityFields()); - if (element.upsertMode()) { - Preconditions.checkState( - !equalityFieldIds.isEmpty(), - "Equality field columns shouldn't be empty when configuring to 
use UPSERT data."); - if (!table.spec().isUnpartitioned()) { - for (PartitionField partitionField : table.spec().fields()) { - Preconditions.checkState( - equalityFieldIds.contains(partitionField.sourceId()), - "In UPSERT mode, partition field '%s' should be included in equality fields: '%s'", - partitionField, - equalityFieldIds); - } - } - } - - LOG.debug("Creating new writer factory for table '{}'", table.name()); - return new RowDataTaskWriterFactory( - () -> table, - FlinkSchemaUtil.convert(element.schema()), - targetDataFileSize, - dataFileFormat, - tableWriteProperties, - Lists.newArrayList(equalityFieldIds), - element.upsertMode(), - element.schema(), - element.spec()); - }); - - taskWriterFactory.initialize(subTaskId, attemptId); - return taskWriterFactory.create(); - }) - .write(element.rowData()); - } - - @Override - public void flush(boolean endOfInput) { - // flush is used to handle flush/endOfInput, so no action is taken here. - } - - @Override - public void close() throws Exception { - for (TaskWriter writer : writers.values()) { - writer.close(); - } - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("subtaskId", subTaskId) - .add("attemptId", attemptId) - .add("dataFileFormat", dataFileFormat) - .add("targetDataFileSize", targetDataFileSize) - .add("writeProperties", commonWriteProperties) - .toString(); - } - - @Override - public Collection prepareCommit() throws IOException { - List result = Lists.newArrayList(); - for (Map.Entry> entry : writers.entrySet()) { - long startNano = System.nanoTime(); - WriteResult writeResult = entry.getValue().complete(); - WriteTarget writeTarget = entry.getKey(); - metrics.updateFlushResult(writeTarget.tableName(), writeResult); - metrics.flushDuration( - writeTarget.tableName(), TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); - LOG.debug( - "Iceberg writer for table {} subtask {} attempt {} flushed {} data files and {} delete files", - writeTarget.tableName(), - subTaskId, - attemptId, - writeResult.dataFiles().length, - writeResult.deleteFiles().length); - - result.add(new DynamicWriteResult(writeTarget, writeResult)); - } - - writers.clear(); - - return result; - } - - private static Set getEqualityFields(Table table, Set equalityFieldIds) { - if (equalityFieldIds != null && !equalityFieldIds.isEmpty()) { - return equalityFieldIds; - } - Set identifierFieldIds = table.schema().identifierFieldIds(); - if (identifierFieldIds != null && !identifierFieldIds.isEmpty()) { - return identifierFieldIds; - } - return Collections.emptySet(); - } - - @VisibleForTesting - DynamicWriterMetrics getMetrics() { - return metrics; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java deleted file mode 100644 index 2e1f82df9d2d..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriterMetrics.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.util.Map; -import org.apache.flink.metrics.MetricGroup; -import org.apache.iceberg.flink.sink.IcebergStreamWriterMetrics; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -class DynamicWriterMetrics { - - private final Map metrics; - private final MetricGroup mainMetricsGroup; - - DynamicWriterMetrics(MetricGroup mainMetricsGroup) { - this.mainMetricsGroup = mainMetricsGroup; - this.metrics = Maps.newHashMap(); - } - - public void updateFlushResult(String fullTableName, WriteResult result) { - writerMetrics(fullTableName).updateFlushResult(result); - } - - public void flushDuration(String fullTableName, long flushDurationMs) { - writerMetrics(fullTableName).flushDuration(flushDurationMs); - } - - IcebergStreamWriterMetrics writerMetrics(String fullTableName) { - return metrics.computeIfAbsent( - fullTableName, tableName -> new IcebergStreamWriterMetrics(mainMetricsGroup, tableName)); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java deleted file mode 100644 index ee0549997178..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/EvolveSchemaVisitor.java +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.util.List; -import org.apache.iceberg.Schema; -import org.apache.iceberg.UpdateSchema; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.schema.SchemaWithPartnerVisitor; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -/** - * Visitor class that accumulates the set of changes needed to evolve an existing schema into the - * target schema. Changes are applied to an {@link UpdateSchema} operation. - * - *

    We support: - * - *

      - *
    • Adding new columns - *
    • Widening the type of existing columsn - *
    • Reordering columns - *
    - * - * We don't support: - * - *
      - *
    • Dropping columns - *
    • Renaming columns - *
    - * - * The reason is that dropping columns would create issues with late / out of order data. Once we - * drop fields, we wouldn't be able to easily add them back later without losing the associated - * data. Renaming columns is not supported because we compare schemas by name, which doesn't allow - * for renaming without additional hints. - */ -public class EvolveSchemaVisitor extends SchemaWithPartnerVisitor { - - private final UpdateSchema api; - private final Schema existingSchema; - private final Schema targetSchema; - - private EvolveSchemaVisitor(UpdateSchema api, Schema existingSchema, Schema targetSchema) { - this.api = api; - this.existingSchema = existingSchema; - this.targetSchema = targetSchema; - } - - /** - * Adds changes needed to produce the target schema to an {@link UpdateSchema} operation. - * - *

    Changes are accumulated to evolve the existingSchema into a targetSchema. - * - * @param api an UpdateSchema for adding changes - * @param existingSchema an existing schema - * @param targetSchema a new schema to compare with the existing - */ - public static void visit(UpdateSchema api, Schema existingSchema, Schema targetSchema) { - visit( - targetSchema, - -1, - new EvolveSchemaVisitor(api, existingSchema, targetSchema), - new CompareSchemasVisitor.PartnerIdByNameAccessors(existingSchema)); - } - - @Override - public Boolean struct(Types.StructType struct, Integer partnerId, List existingFields) { - if (partnerId == null) { - return true; - } - - // Add, update and order fields in the struct - Types.StructType partnerStruct = findFieldType(partnerId).asStructType(); - String after = null; - for (Types.NestedField targetField : struct.fields()) { - Types.NestedField nestedField = partnerStruct.field(targetField.name()); - final String columnName; - if (nestedField != null) { - updateColumn(nestedField, targetField); - columnName = this.existingSchema.findColumnName(nestedField.fieldId()); - } else { - addColumn(partnerId, targetField); - columnName = this.targetSchema.findColumnName(targetField.fieldId()); - } - - setPosition(columnName, after); - after = columnName; - } - - // Ensure that unused fields are made optional - for (Types.NestedField existingField : partnerStruct.fields()) { - if (struct.field(existingField.name()) == null) { - if (existingField.isRequired()) { - this.api.makeColumnOptional(this.existingSchema.findColumnName(existingField.fieldId())); - } - } - } - - return false; - } - - @Override - public Boolean field(Types.NestedField field, Integer partnerId, Boolean isFieldMissing) { - return partnerId == null; - } - - @Override - public Boolean list(Types.ListType list, Integer partnerId, Boolean isElementMissing) { - if (partnerId == null) { - return true; - } - - Preconditions.checkState( - !isElementMissing, "Error traversing schemas: element is missing, but list is present"); - - Types.ListType partnerList = findFieldType(partnerId).asListType(); - updateColumn(partnerList.fields().get(0), list.fields().get(0)); - - return false; - } - - @Override - public Boolean map( - Types.MapType map, Integer partnerId, Boolean isKeyMissing, Boolean isValueMissing) { - if (partnerId == null) { - return true; - } - - Preconditions.checkState( - !isKeyMissing, "Error traversing schemas: key is missing, but map is present"); - Preconditions.checkState( - !isValueMissing, "Error traversing schemas: value is missing, but map is present"); - - Types.MapType partnerMap = findFieldType(partnerId).asMapType(); - updateColumn(partnerMap.fields().get(0), map.fields().get(0)); - updateColumn(partnerMap.fields().get(1), map.fields().get(1)); - - return false; - } - - @Override - public Boolean primitive(Type.PrimitiveType primitive, Integer partnerId) { - return partnerId == null; - } - - private Type findFieldType(int fieldId) { - if (fieldId == -1) { - return existingSchema.asStruct(); - } else { - return existingSchema.findField(fieldId).type(); - } - } - - private void addColumn(int parentId, Types.NestedField field) { - String parentName = existingSchema.findColumnName(parentId); - api.addColumn(parentName, field.name(), field.type(), field.doc()); - } - - private void updateColumn(Types.NestedField existingField, Types.NestedField targetField) { - String existingColumnName = this.existingSchema.findColumnName(existingField.fieldId()); - - boolean needsOptionalUpdate = 
targetField.isOptional() && existingField.isRequired(); - boolean needsTypeUpdate = - targetField.type().isPrimitiveType() && !targetField.type().equals(existingField.type()); - boolean needsDocUpdate = - targetField.doc() != null && !targetField.doc().equals(existingField.doc()); - - if (needsOptionalUpdate) { - api.makeColumnOptional(existingColumnName); - } - - if (needsTypeUpdate) { - api.updateColumn(existingColumnName, targetField.type().asPrimitiveType()); - } - - if (needsDocUpdate) { - api.updateColumnDoc(existingColumnName, targetField.doc()); - } - } - - private void setPosition(String columnName, String after) { - if (after == null) { - this.api.moveFirst(columnName); - } else { - this.api.moveAfter(columnName, after); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java deleted file mode 100644 index 91aa4a91710c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; - -import java.util.Collections; -import java.util.Map; -import java.util.Objects; -import java.util.Set; -import java.util.stream.Collectors; -import javax.annotation.Nullable; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.runtime.state.KeyGroupRangeAssignment; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.sink.EqualityFieldKeySelector; -import org.apache.iceberg.flink.sink.PartitionKeySelector; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * The HashKeyGenerator is responsible for creating the appropriate hash key for Flink's keyBy - * operation. The hash key is generated depending on the user-provided DynamicRecord and the table - * metadata. Under the hood, we maintain a set of Flink {@link KeySelector}s which implement the - * appropriate Iceberg {@link DistributionMode}. 
For every table, we randomly select a consistent - * subset of writer subtasks which receive data via their associated keys, depending on the chosen - * DistributionMode. - * - *
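For illustration only, and not part of this diff: a minimal, self-contained Java sketch of deriving a stable per-table subset of writer subtasks. The real code maps keys through Flink's KeyGroupRangeAssignment; the plain modulo below is a stand-in, and all names are hypothetical.

    import java.util.LinkedHashSet;
    import java.util.Set;

    public class WriterSubsetSketch {
      // Derive a deterministic subset of writer subtasks for a table by probing hash keys
      // seeded with the table name, so the same table always maps to the same writers.
      static Set<Integer> writerSubtasks(String tableName, int writeParallelism, int maxWriteParallelism) {
        int target = Math.min(writeParallelism, maxWriteParallelism);
        Set<Integer> subtasks = new LinkedHashSet<>();
        int key = tableName.hashCode();
        while (subtasks.size() < target) {
          subtasks.add(Math.floorMod(key++, maxWriteParallelism)); // stand-in for the key-group mapping
        }
        return subtasks;
      }

      public static void main(String[] args) {
        System.out.println(writerSubtasks("db.events", 4, 16)); // same subset on every run
      }
    }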

    Caching ensures that a new key selector is also created when the table metadata (e.g. schema, - * spec) or the user-provided metadata changes (e.g. distribution mode, write parallelism). - * - *

    Note: The hashing must be deterministic given the same parameters of the KeySelector and the - * same provided values. - */ -class HashKeyGenerator { - private static final Logger LOG = LoggerFactory.getLogger(HashKeyGenerator.class); - - private final int maxWriteParallelism; - private final Map> keySelectorCache; - - HashKeyGenerator(int maxCacheSize, int maxWriteParallelism) { - this.maxWriteParallelism = maxWriteParallelism; - this.keySelectorCache = new LRUCache<>(maxCacheSize); - } - - int generateKey(DynamicRecord dynamicRecord) throws Exception { - return generateKey(dynamicRecord, null, null, null); - } - - int generateKey( - DynamicRecord dynamicRecord, - @Nullable Schema tableSchema, - @Nullable PartitionSpec tableSpec, - @Nullable RowData overrideRowData) { - String tableIdent = dynamicRecord.tableIdentifier().toString(); - SelectorKey cacheKey = - new SelectorKey( - tableIdent, - dynamicRecord.branch(), - tableSchema != null ? tableSchema.schemaId() : null, - tableSpec != null ? tableSpec.specId() : null, - dynamicRecord.schema(), - dynamicRecord.spec(), - dynamicRecord.equalityFields()); - KeySelector keySelector = - keySelectorCache.computeIfAbsent( - cacheKey, - k -> - getKeySelector( - tableIdent, - MoreObjects.firstNonNull(tableSchema, dynamicRecord.schema()), - MoreObjects.firstNonNull(tableSpec, dynamicRecord.spec()), - MoreObjects.firstNonNull( - dynamicRecord.distributionMode(), DistributionMode.NONE), - MoreObjects.firstNonNull( - dynamicRecord.equalityFields(), Collections.emptySet()), - dynamicRecord.writeParallelism())); - try { - return keySelector.getKey( - overrideRowData != null ? overrideRowData : dynamicRecord.rowData()); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - private KeySelector getKeySelector( - String tableName, - Schema schema, - PartitionSpec spec, - DistributionMode mode, - Set equalityFields, - int writeParallelism) { - LOG.debug( - "Creating new KeySelector for table '{}' with distribution mode '{}'", tableName, mode); - switch (mode) { - case NONE: - if (equalityFields.isEmpty()) { - return tableKeySelector(tableName, writeParallelism, maxWriteParallelism); - } else { - LOG.info( - "{}: Distribute rows by equality fields, because there are equality fields set", - tableName); - return equalityFieldKeySelector( - tableName, schema, equalityFields, writeParallelism, maxWriteParallelism); - } - - case HASH: - if (equalityFields.isEmpty()) { - if (spec.isUnpartitioned()) { - LOG.warn( - "{}: Fallback to use 'none' distribution mode, because there are no equality fields set " - + "and table is unpartitioned", - tableName); - return tableKeySelector(tableName, writeParallelism, maxWriteParallelism); - } else { - return partitionKeySelector( - tableName, schema, spec, writeParallelism, maxWriteParallelism); - } - } else { - if (spec.isUnpartitioned()) { - LOG.info( - "{}: Distribute rows by equality fields, because there are equality fields set " - + "and table is unpartitioned", - tableName); - return equalityFieldKeySelector( - tableName, schema, equalityFields, writeParallelism, maxWriteParallelism); - } else { - for (PartitionField partitionField : spec.fields()) { - Preconditions.checkState( - equalityFields.contains(partitionField.name()), - "%s: In 'hash' distribution mode with equality fields set, partition field '%s' " - + "should be included in equality fields: '%s'", - tableName, - partitionField, - schema.columns().stream() - .filter(c -> equalityFields.contains(c.name())) - .collect(Collectors.toList())); - } 
- return partitionKeySelector( - tableName, schema, spec, writeParallelism, maxWriteParallelism); - } - } - - case RANGE: - if (schema.identifierFieldIds().isEmpty()) { - LOG.warn( - "{}: Fallback to use 'none' distribution mode, because there are no equality fields set " - + "and {}='range' is not supported yet in flink", - tableName, - WRITE_DISTRIBUTION_MODE); - return tableKeySelector(tableName, writeParallelism, maxWriteParallelism); - } else { - LOG.info( - "{}: Distribute rows by equality fields, because there are equality fields set " - + "and {}='range' is not supported yet in flink", - tableName, - WRITE_DISTRIBUTION_MODE); - return equalityFieldKeySelector( - tableName, schema, equalityFields, writeParallelism, maxWriteParallelism); - } - - default: - throw new IllegalArgumentException( - tableName + ": Unrecognized " + WRITE_DISTRIBUTION_MODE + ": " + mode); - } - } - - private static KeySelector equalityFieldKeySelector( - String tableName, - Schema schema, - Set equalityFields, - int writeParallelism, - int maxWriteParallelism) { - return new TargetLimitedKeySelector( - new EqualityFieldKeySelector( - schema, - FlinkSchemaUtil.convert(schema), - DynamicSinkUtil.getEqualityFieldIds(equalityFields, schema)), - tableName, - writeParallelism, - maxWriteParallelism); - } - - private static KeySelector partitionKeySelector( - String tableName, - Schema schema, - PartitionSpec spec, - int writeParallelism, - int maxWriteParallelism) { - KeySelector inner = - new PartitionKeySelector(spec, schema, FlinkSchemaUtil.convert(schema)); - return new TargetLimitedKeySelector( - in -> inner.getKey(in).hashCode(), tableName, writeParallelism, maxWriteParallelism); - } - - private static KeySelector tableKeySelector( - String tableName, int writeParallelism, int maxWriteParallelism) { - return new TargetLimitedKeySelector( - new RoundRobinKeySelector<>(writeParallelism), - tableName, - writeParallelism, - maxWriteParallelism); - } - - /** - * Generates a new key using the salt as a base, and reduces the target key range of the {@link - * #wrapped} {@link KeySelector} to {@link #writeParallelism}. - */ - private static class TargetLimitedKeySelector implements KeySelector { - private final KeySelector wrapped; - private final int writeParallelism; - private final int[] distinctKeys; - - @SuppressWarnings("checkstyle:ParameterAssignment") - TargetLimitedKeySelector( - KeySelector wrapped, - String tableName, - int writeParallelism, - int maxWriteParallelism) { - if (writeParallelism > maxWriteParallelism) { - LOG.warn( - "{}: writeParallelism {} is greater than maxWriteParallelism {}. 
Capping writeParallelism at {}", - tableName, - writeParallelism, - maxWriteParallelism, - maxWriteParallelism); - writeParallelism = maxWriteParallelism; - } - this.wrapped = wrapped; - this.writeParallelism = writeParallelism; - this.distinctKeys = new int[writeParallelism]; - - // Ensures that the generated keys are always result in unique slotId - Set targetSlots = Sets.newHashSetWithExpectedSize(writeParallelism); - int nextKey = tableName.hashCode(); - for (int i = 0; i < writeParallelism; ++i) { - int subtaskId = subtaskId(nextKey, writeParallelism, maxWriteParallelism); - while (targetSlots.contains(subtaskId)) { - ++nextKey; - subtaskId = subtaskId(nextKey, writeParallelism, maxWriteParallelism); - } - - targetSlots.add(subtaskId); - distinctKeys[i] = nextKey; - ++nextKey; - } - } - - @Override - public Integer getKey(RowData value) throws Exception { - return distinctKeys[ - DynamicSinkUtil.safeAbs(wrapped.getKey(value).hashCode()) % writeParallelism]; - } - - private static int subtaskId(int key, int writeParallelism, int maxWriteParallelism) { - return KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup( - maxWriteParallelism, - writeParallelism, - KeyGroupRangeAssignment.computeKeyGroupForKeyHash(key, maxWriteParallelism)); - } - } - - /** - * Generates evenly distributed keys between [0..{@link #maxTarget}) range using round-robin - * algorithm. - * - * @param unused input for key generation - */ - private static class RoundRobinKeySelector implements KeySelector { - private final int maxTarget; - private int lastTarget = 0; - - RoundRobinKeySelector(int maxTarget) { - this.maxTarget = maxTarget; - } - - @Override - public Integer getKey(T value) { - lastTarget = (lastTarget + 1) % maxTarget; - return lastTarget; - } - } - - /** - * Cache key for the {@link KeySelector}. Only contains the {@link Schema} and the {@link - * PartitionSpec} if their ids are not provided. - */ - static class SelectorKey { - private final String tableName; - private final String branch; - private final Integer schemaId; - private final Integer specId; - private final Schema schema; - private final PartitionSpec spec; - private final Set equalityFields; - - SelectorKey( - String tableName, - String branch, - @Nullable Integer tableSchemaId, - @Nullable Integer tableSpecId, - Schema schema, - PartitionSpec spec, - Set equalityFields) { - this.tableName = tableName; - this.branch = branch; - this.schemaId = tableSchemaId; - this.specId = tableSpecId; - this.schema = tableSchemaId == null ? schema : null; - this.spec = tableSpecId == null ? 
spec : null; - this.equalityFields = equalityFields; - } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - - if (other == null || getClass() != other.getClass()) { - return false; - } - - SelectorKey that = (SelectorKey) other; - return Objects.equals(tableName, that.tableName) - && Objects.equals(branch, that.branch) - && Objects.equals(schemaId, that.schemaId) - && Objects.equals(specId, that.specId) - && Objects.equals(schema, that.schema) - && Objects.equals(spec, that.spec) - && Objects.equals(equalityFields, that.equalityFields); - } - - @Override - public int hashCode() { - return Objects.hash(tableName, branch, schemaId, specId, schema, spec, equalityFields); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("tableName", tableName) - .add("branch", branch) - .add("schemaId", schemaId) - .add("specId", specId) - .add("schema", schema) - .add("spec", spec) - .add("equalityFields", equalityFields) - .toString(); - } - } - - @VisibleForTesting - Map> getKeySelectorCache() { - return keySelectorCache; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java deleted file mode 100644 index be2866dc4e19..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/LRUCache.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.function.Consumer; - -/** - * A performant, fixed size least recently used (LRU) cache implementation. - * - *

    This cache has O(1) time complexity for get/put operations and provides eviction notifications - * when entries are removed due to size constraints. It offers better performance than similarly - * configured Caffeine caches, making it ideal for hot path operations. - * - *
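For illustration only, and not part of this diff: a self-contained sketch of the same access-ordered LinkedHashMap pattern with an eviction callback; class and variable names are placeholders.

    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.function.Consumer;

    class LruSketch<K, V> extends LinkedHashMap<K, V> {
      private final int maximumSize;
      private final Consumer<Map.Entry<K, V>> onEvict;

      LruSketch(int maximumSize, Consumer<Map.Entry<K, V>> onEvict) {
        super(16, 0.75f, true); // accessOrder=true moves entries to the tail on every get/put
        this.maximumSize = maximumSize;
        this.onEvict = onEvict;
      }

      @Override
      protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
        boolean evict = size() > maximumSize;
        if (evict) {
          onEvict.accept(eldest); // notify before the eldest (least recently used) entry is dropped
        }
        return evict;
      }

      public static void main(String[] args) {
        LruSketch<String, Integer> cache = new LruSketch<>(2, e -> System.out.println("evicted " + e.getKey()));
        cache.put("a", 1);
        cache.put("b", 2);
        cache.get("a");    // touch "a" so "b" becomes the eldest entry
        cache.put("c", 3); // prints: evicted b
      }
    }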

    This implementation extends {@link LinkedHashMap} with access-order traversal and automated - * removal of least recently used entries when the maximum size is reached. - */ -@SuppressWarnings("checkstyle:IllegalType") -class LRUCache extends LinkedHashMap { - /** Defaults from {@link java.util.HashMap} */ - private static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; - - private static final float DEFAULT_LOAD_FACTOR = 0.75f; - - private final int maximumSize; - private final Consumer> evictionCallback; - - LRUCache(int maximumSize) { - this(maximumSize, ignored -> {}); - } - - LRUCache(int maximumSize, Consumer> evictionCallback) { - super(Math.min(maximumSize, DEFAULT_INITIAL_CAPACITY), DEFAULT_LOAD_FACTOR, true); - this.maximumSize = maximumSize; - this.evictionCallback = evictionCallback; - } - - @Override - protected boolean removeEldestEntry(Map.Entry eldest) { - boolean remove = size() > maximumSize; - if (remove) { - evictionCallback.accept(eldest); - } - - return remove; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java deleted file mode 100644 index 90b6c7295cb7..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/PartitionSpecEvolution.java +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.util.List; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.expressions.Term; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -/** Checks compatibility of PartitionSpecs and evolves one into the other. */ -public class PartitionSpecEvolution { - - private PartitionSpecEvolution() {} - - /** - * Checks whether two PartitionSpecs are compatible with each other. Less strict than {@code - * PartitionSpec#compatible} in the sense that it tolerates differently named partition fields, as - * long as their transforms and field names corresponding to their source ids match. 
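For illustration only, and not part of this diff: a plain-Java sketch of this looser notion of compatibility, where two specs match if they declare the same ordered list of (source column name, transform) pairs; the record type and values are hypothetical.

    import java.util.List;

    public class SpecCompatibilitySketch {
      // A partition field reduced to what matters for compatibility here.
      record SpecField(String sourceColumn, String transform) {}

      static boolean compatible(List<SpecField> spec1, List<SpecField> spec2) {
        if (spec1.size() != spec2.size()) {
          return false;
        }
        for (int i = 0; i < spec1.size(); i++) {
          if (!spec1.get(i).equals(spec2.get(i))) { // record equality: same source column and transform
            return false;
          }
        }
        return true;
      }

      public static void main(String[] args) {
        List<SpecField> a = List.of(new SpecField("ts", "day"), new SpecField("id", "bucket[16]"));
        List<SpecField> b = List.of(new SpecField("ts", "day"), new SpecField("id", "bucket[16]"));
        System.out.println(compatible(a, b)); // true, even if the partition field names differ
      }
    }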
- */ - public static boolean checkCompatibility(PartitionSpec spec1, PartitionSpec spec2) { - if (spec1.equals(spec2)) { - return true; - } - - if (spec1.fields().size() != spec2.fields().size()) { - return false; - } - - for (int i = 0; i < spec1.fields().size(); i++) { - PartitionField field1 = spec1.fields().get(i); - PartitionField field2 = spec2.fields().get(i); - if (!specFieldsAreCompatible(field1, spec1.schema(), field2, spec2.schema())) { - return false; - } - } - - return true; - } - - static PartitionSpecChanges evolve(PartitionSpec currentSpec, PartitionSpec targetSpec) { - if (currentSpec.compatibleWith(targetSpec)) { - return new PartitionSpecChanges(); - } - - PartitionSpecChanges result = new PartitionSpecChanges(); - - int maxNumFields = Math.max(currentSpec.fields().size(), targetSpec.fields().size()); - for (int i = 0; i < maxNumFields; i++) { - PartitionField currentField = Iterables.get(currentSpec.fields(), i, null); - PartitionField targetField = Iterables.get(targetSpec.fields(), i, null); - - if (!specFieldsAreCompatible( - currentField, currentSpec.schema(), targetField, targetSpec.schema())) { - - if (currentField != null) { - result.remove(toTerm(currentField, currentSpec.schema())); - } - - if (targetField != null) { - result.add(toTerm(targetField, targetSpec.schema())); - } - } - } - - return result; - } - - static class PartitionSpecChanges { - private final List termsToAdd = Lists.newArrayList(); - private final List termsToRemove = Lists.newArrayList(); - - public void add(Term term) { - termsToAdd.add(term); - } - - public void remove(Term term) { - termsToRemove.add(term); - } - - public List termsToAdd() { - return termsToAdd; - } - - public List termsToRemove() { - return termsToRemove; - } - - public boolean isEmpty() { - return termsToAdd.isEmpty() && termsToRemove.isEmpty(); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(PartitionSpecEvolution.class) - .add("termsToAdd", termsToAdd) - .add("termsToRemove", termsToRemove) - .toString(); - } - } - - private static Term toTerm(PartitionField field, Schema schema) { - String sourceName = schema.idToName().get(field.sourceId()); - return Expressions.transform(sourceName, field.transform()); - } - - private static boolean specFieldsAreCompatible( - PartitionField field1, Schema schemaField1, PartitionField field2, Schema schemaField2) { - if (field1 == null || field2 == null) { - return false; - } - String firstFieldSourceName = schemaField1.idToName().get(field1.sourceId()); - String secondFieldSourceName = schemaField2.idToName().get(field2.sourceId()); - return firstFieldSourceName.equals(secondFieldSourceName) - && field1.transform().toString().equals(field2.transform().toString()); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java deleted file mode 100644 index 85a5a4abf29c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.util.Map; -import java.util.Set; -import org.apache.flink.annotation.Internal; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.NoSuchTableException; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * TableMetadataCache is responsible for caching table metadata to avoid hitting the catalog too - * frequently. We store table identifier, schema, partition spec, and a set of past schema - * comparison results of the active table schema against the last input schemas. - */ -@Internal -class TableMetadataCache { - - private static final Logger LOG = LoggerFactory.getLogger(TableMetadataCache.class); - private static final Tuple2 EXISTS = Tuple2.of(true, null); - private static final Tuple2 NOT_EXISTS = Tuple2.of(false, null); - static final ResolvedSchemaInfo NOT_FOUND = - new ResolvedSchemaInfo( - null, CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED, DataConverter.identity()); - - private final Catalog catalog; - private final long refreshMs; - private final int inputSchemasPerTableCacheMaximumSize; - private final Map tableCache; - - TableMetadataCache( - Catalog catalog, int maximumSize, long refreshMs, int inputSchemasPerTableCacheMaximumSize) { - this.catalog = catalog; - this.refreshMs = refreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; - this.tableCache = new LRUCache<>(maximumSize); - } - - Tuple2 exists(TableIdentifier identifier) { - CacheItem cached = tableCache.get(identifier); - if (cached != null && Boolean.TRUE.equals(cached.tableExists)) { - return EXISTS; - } else if (needsRefresh(cached, true)) { - return refreshTable(identifier); - } else { - return NOT_EXISTS; - } - } - - String branch(TableIdentifier identifier, String branch) { - return branch(identifier, branch, true); - } - - ResolvedSchemaInfo schema(TableIdentifier identifier, Schema input) { - return schema(identifier, input, true); - } - - PartitionSpec spec(TableIdentifier identifier, PartitionSpec spec) { - return spec(identifier, spec, true); - } - - void update(TableIdentifier identifier, Table table) { - tableCache.put( - identifier, - new CacheItem( - true, - table.refs().keySet(), - table.schemas(), - table.specs(), - inputSchemasPerTableCacheMaximumSize)); - } - - private String branch(TableIdentifier identifier, String branch, boolean allowRefresh) { - CacheItem cached = tableCache.get(identifier); - if (cached != null && cached.tableExists && cached.branches.contains(branch)) { - return branch; - } - - if (needsRefresh(cached, allowRefresh)) { - refreshTable(identifier); - return branch(identifier, branch, false); - } else { - return null; - } - } - - private ResolvedSchemaInfo schema( - TableIdentifier identifier, 
Schema input, boolean allowRefresh) { - CacheItem cached = tableCache.get(identifier); - Schema compatible = null; - if (cached != null && cached.tableExists) { - // This only works if the {@link Schema#equals(Object)} returns true for the old schema - // and a new schema. Performance is paramount as this code is on the hot path. Every other - // way for comparing 2 schemas were performing worse than the - // {@link CompareByNameVisitor#visit(Schema, Schema, boolean)}, so caching was useless. - ResolvedSchemaInfo lastResult = cached.inputSchemas.get(input); - if (lastResult != null) { - return lastResult; - } - - for (Map.Entry tableSchema : cached.tableSchemas.entrySet()) { - CompareSchemasVisitor.Result result = - CompareSchemasVisitor.visit(input, tableSchema.getValue(), true); - if (result == CompareSchemasVisitor.Result.SAME) { - ResolvedSchemaInfo newResult = - new ResolvedSchemaInfo( - tableSchema.getValue(), - CompareSchemasVisitor.Result.SAME, - DataConverter.identity()); - cached.inputSchemas.put(input, newResult); - return newResult; - } else if (compatible == null - && result == CompareSchemasVisitor.Result.DATA_CONVERSION_NEEDED) { - compatible = tableSchema.getValue(); - } - } - } - - if (needsRefresh(cached, allowRefresh)) { - refreshTable(identifier); - return schema(identifier, input, false); - } else if (compatible != null) { - ResolvedSchemaInfo newResult = - new ResolvedSchemaInfo( - compatible, - CompareSchemasVisitor.Result.DATA_CONVERSION_NEEDED, - DataConverter.get( - FlinkSchemaUtil.convert(input), FlinkSchemaUtil.convert(compatible))); - cached.inputSchemas.put(input, newResult); - return newResult; - } else if (cached != null && cached.tableExists) { - cached.inputSchemas.put(input, NOT_FOUND); - return NOT_FOUND; - } else { - return NOT_FOUND; - } - } - - private PartitionSpec spec(TableIdentifier identifier, PartitionSpec spec, boolean allowRefresh) { - CacheItem cached = tableCache.get(identifier); - if (cached != null && cached.tableExists) { - for (PartitionSpec tableSpec : cached.specs.values()) { - if (PartitionSpecEvolution.checkCompatibility(tableSpec, spec)) { - return tableSpec; - } - } - } - - if (needsRefresh(cached, allowRefresh)) { - refreshTable(identifier); - return spec(identifier, spec, false); - } else { - return null; - } - } - - private Tuple2 refreshTable(TableIdentifier identifier) { - try { - Table table = catalog.loadTable(identifier); - update(identifier, table); - return EXISTS; - } catch (NoSuchTableException e) { - LOG.debug("Table doesn't exist {}", identifier, e); - tableCache.put(identifier, new CacheItem(false, null, null, null, 1)); - return Tuple2.of(false, e); - } - } - - private boolean needsRefresh(CacheItem cacheItem, boolean allowRefresh) { - return allowRefresh - && (cacheItem == null || cacheItem.created + refreshMs > System.currentTimeMillis()); - } - - public void invalidate(TableIdentifier identifier) { - tableCache.remove(identifier); - } - - /** Handles timeout for missing items only. Caffeine performance causes noticeable delays. 
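For illustration only, and not part of this diff: a sketch of the negative-caching idea described here, where a "table missing" answer is only re-checked against the catalog after a refresh interval; all names are hypothetical and the real class carries far more state.

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;
    import java.util.function.Predicate;

    public class NegativeCacheSketch {
      record Entry(boolean exists, long createdMs) {}

      private final ConcurrentMap<String, Entry> cache = new ConcurrentHashMap<>();
      private final long refreshMs;

      NegativeCacheSketch(long refreshMs) {
        this.refreshMs = refreshMs;
      }

      boolean exists(String table, Predicate<String> catalogLookup) {
        Entry e = cache.get(table);
        if (e != null && (e.exists() || System.currentTimeMillis() - e.createdMs() < refreshMs)) {
          return e.exists(); // positive answers are trusted; recent negative answers are not re-checked
        }
        boolean exists = catalogLookup.test(table); // hit the catalog only when the negative entry is stale
        cache.put(table, new Entry(exists, System.currentTimeMillis()));
        return exists;
      }
    }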
*/ - static class CacheItem { - private final long created = System.currentTimeMillis(); - - private final boolean tableExists; - private final Set branches; - private final Map tableSchemas; - private final Map specs; - private final Map inputSchemas; - - private CacheItem( - boolean tableExists, - Set branches, - Map tableSchemas, - Map specs, - int inputSchemaCacheMaximumSize) { - this.tableExists = tableExists; - this.branches = branches; - this.tableSchemas = tableSchemas; - this.specs = specs; - this.inputSchemas = - new LRUCache<>(inputSchemaCacheMaximumSize, CacheItem::inputSchemaEvictionListener); - } - - private static void inputSchemaEvictionListener( - Map.Entry evictedEntry) { - LOG.warn( - "Performance degraded as records with different schema is generated for the same table. " - + "Likely the DynamicRecord.schema is not reused. " - + "Reuse the same instance if the record schema is the same to improve performance"); - } - - @VisibleForTesting - Map inputSchemas() { - return inputSchemas; - } - } - - static class ResolvedSchemaInfo { - private final Schema resolvedTableSchema; - private final CompareSchemasVisitor.Result compareResult; - private final DataConverter recordConverter; - - ResolvedSchemaInfo( - Schema tableSchema, - CompareSchemasVisitor.Result compareResult, - DataConverter recordConverter) { - this.resolvedTableSchema = tableSchema; - this.compareResult = compareResult; - this.recordConverter = recordConverter; - } - - Schema resolvedTableSchema() { - return resolvedTableSchema; - } - - CompareSchemasVisitor.Result compareResult() { - return compareResult; - } - - DataConverter recordConverter() { - return recordConverter; - } - } - - @VisibleForTesting - Map getInternalCache() { - return tableCache; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java deleted file mode 100644 index 84d0ed9be5d0..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableSerializerCache.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.io.Serializable; -import java.util.Map; -import javax.annotation.Nullable; -import org.apache.flink.annotation.Internal; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.java.tuple.Tuple3; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -/** - * A Cache which holds Flink's {@link RowDataSerializer} for a given table name and schema. This - * avoids re-creating the serializer for a given table schema for every incoming record. - * - *

    There is an additional optimization built into this class: Users do not have to supply the - * full schema / spec, but can also provide their id. This avoids transferring the schema / spec for - * every record. If the id is unknown, the schema / spec will be retrieved from the catalog. - * - *
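For illustration only, and not part of this diff: a small sketch of the id-or-value lookup described above, where callers may ship only a schema id and the full schema is fetched (hypothetically) from the catalog the first time that id is seen.

    import java.util.HashMap;
    import java.util.Map;
    import java.util.function.Function;

    public class IdOrValueCacheSketch {
      private final Map<Integer, String> schemasById = new HashMap<>();

      String resolve(Integer schemaId, String fullSchema, Function<Integer, String> catalogLoad) {
        if (fullSchema != null) {
          return fullSchema; // the caller shipped the full schema, nothing to look up
        }
        // Only the id was shipped; reuse the cached value or load it once from the catalog.
        return schemasById.computeIfAbsent(schemaId, catalogLoad);
      }
    }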

    Note that the caller must ensure that ids are only used for known schemas / specs. The id - * optimization must not be used in the update path. - */ -@Internal -class TableSerializerCache implements Serializable { - - private final CatalogLoader catalogLoader; - private final int maximumSize; - private transient Map serializers; - - TableSerializerCache(CatalogLoader catalogLoader, int maximumSize) { - this.catalogLoader = catalogLoader; - this.maximumSize = maximumSize; - } - - RowDataSerializer serializer(String tableName, Schema schema, PartitionSpec spec) { - return serializer(tableName, schema, spec, null, null).f0; - } - - Tuple3 serializerWithSchemaAndSpec( - String tableName, Integer schemaId, Integer specId) { - return serializer(tableName, null, null, schemaId, specId); - } - - private Tuple3 serializer( - String tableName, - @Nullable Schema unknownSchema, - @Nullable PartitionSpec unknownSpec, - @Nullable Integer schemaId, - @Nullable Integer specId) { - Preconditions.checkState( - (unknownSchema == null && unknownSpec == null) ^ (schemaId == null && specId == null), - "Either the full schema/spec or their ids must be provided."); - - if (serializers == null) { - // We need to initialize the cache at the first time - this.serializers = new LRUCache<>(maximumSize); - } - - SerializerInfo info = serializers.computeIfAbsent(tableName, SerializerInfo::new); - Schema schema = unknownSchema != null ? unknownSchema : info.schemas.get(schemaId); - PartitionSpec spec = unknownSpec != null ? unknownSpec : info.specs.get(specId); - - if (schema == null || spec == null) { - info.update(); - schema = info.schemas.get(schemaId); - spec = info.specs.get(specId); - } - - RowDataSerializer serializer = - info.serializers.computeIfAbsent( - schema, s -> new RowDataSerializer(FlinkSchemaUtil.convert(s))); - - return Tuple3.of(serializer, schema, spec); - } - - CatalogLoader catalogLoader() { - return catalogLoader; - } - - int maximumSize() { - return maximumSize; - } - - private class SerializerInfo { - private final String tableName; - private final Map serializers; - private Map schemas; - private Map specs; - - SerializerInfo(String tableName) { - this.tableName = tableName; - this.serializers = Maps.newHashMapWithExpectedSize(2); - this.schemas = Maps.newHashMapWithExpectedSize(1); - this.specs = Maps.newHashMapWithExpectedSize(0); - } - - private void update() { - Table table = catalogLoader.loadCatalog().loadTable(TableIdentifier.parse(tableName)); - schemas = table.schemas(); - specs = table.specs(); - } - } - - @VisibleForTesting - Map getCache() { - return serializers; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java deleted file mode 100644 index fdd182830b2c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableUpdater.java +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.flink.api.java.tuple.Tuple3; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.UpdatePartitionSpec; -import org.apache.iceberg.UpdateSchema; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.SupportsNamespaces; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.AlreadyExistsException; -import org.apache.iceberg.exceptions.CommitFailedException; -import org.apache.iceberg.exceptions.NoSuchNamespaceException; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Updates the Iceberg tables in case of schema, branch, or partition changes. */ -@Internal -class TableUpdater { - - private static final Logger LOG = LoggerFactory.getLogger(TableUpdater.class); - private final TableMetadataCache cache; - private final Catalog catalog; - - TableUpdater(TableMetadataCache cache, Catalog catalog) { - this.cache = cache; - this.catalog = catalog; - } - - /** - * Creates or updates a table to make sure that the given branch, schema, spec exists. - * - * @return a {@link Tuple3} of the new {@link Schema}, the status of the schema compared to the - * requested one, and the new {@link PartitionSpec#specId()}. - */ - Tuple2 update( - TableIdentifier tableIdentifier, String branch, Schema schema, PartitionSpec spec) { - findOrCreateTable(tableIdentifier, schema, spec); - findOrCreateBranch(tableIdentifier, branch); - TableMetadataCache.ResolvedSchemaInfo newSchemaInfo = - findOrCreateSchema(tableIdentifier, schema); - PartitionSpec newSpec = findOrCreateSpec(tableIdentifier, spec); - return Tuple2.of(newSchemaInfo, newSpec); - } - - private void findOrCreateTable(TableIdentifier identifier, Schema schema, PartitionSpec spec) { - Tuple2 exists = cache.exists(identifier); - if (Boolean.FALSE.equals(exists.f0)) { - if (exists.f1 instanceof NoSuchNamespaceException) { - SupportsNamespaces catalogWithNameSpace = (SupportsNamespaces) catalog; - LOG.info("Namespace {} not found during table search. Creating namespace", identifier); - try { - catalogWithNameSpace.createNamespace(identifier.namespace()); - } catch (AlreadyExistsException e) { - LOG.debug("Namespace {} created concurrently", identifier.namespace(), e); - } - } - - LOG.info("Table {} not found during table search. Creating table.", identifier); - try { - Table table = catalog.createTable(identifier, schema, spec); - cache.update(identifier, table); - } catch (AlreadyExistsException e) { - LOG.debug("Table {} created concurrently. 
Skipping creation.", identifier, e); - cache.invalidate(identifier); - findOrCreateTable(identifier, schema, spec); - } - } - } - - private void findOrCreateBranch(TableIdentifier identifier, String branch) { - String fromCache = cache.branch(identifier, branch); - if (fromCache == null) { - Table table = catalog.loadTable(identifier); - try { - table.manageSnapshots().createBranch(branch).commit(); - LOG.info("Branch {} for {} created", branch, identifier); - } catch (CommitFailedException e) { - table.refresh(); - if (table.refs().containsKey(branch)) { - LOG.debug("Branch {} concurrently created for {}.", branch, identifier); - } else { - LOG.error("Failed to create branch {} for {}.", branch, identifier, e); - throw e; - } - } - - cache.update(identifier, table); - } - } - - private TableMetadataCache.ResolvedSchemaInfo findOrCreateSchema( - TableIdentifier identifier, Schema schema) { - TableMetadataCache.ResolvedSchemaInfo fromCache = cache.schema(identifier, schema); - if (fromCache.compareResult() != CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED) { - return fromCache; - } else { - Table table = catalog.loadTable(identifier); - Schema tableSchema = table.schema(); - CompareSchemasVisitor.Result result = CompareSchemasVisitor.visit(schema, tableSchema, true); - switch (result) { - case SAME: - cache.update(identifier, table); - return new TableMetadataCache.ResolvedSchemaInfo( - tableSchema, result, DataConverter.identity()); - case DATA_CONVERSION_NEEDED: - cache.update(identifier, table); - return new TableMetadataCache.ResolvedSchemaInfo( - tableSchema, - result, - DataConverter.get( - FlinkSchemaUtil.convert(schema), FlinkSchemaUtil.convert(tableSchema))); - case SCHEMA_UPDATE_NEEDED: - LOG.info( - "Triggering schema update for table {} {} to {}", identifier, tableSchema, schema); - UpdateSchema updateApi = table.updateSchema(); - EvolveSchemaVisitor.visit(updateApi, tableSchema, schema); - - try { - updateApi.commit(); - cache.update(identifier, table); - TableMetadataCache.ResolvedSchemaInfo comparisonAfterMigration = - cache.schema(identifier, schema); - Schema newSchema = comparisonAfterMigration.resolvedTableSchema(); - LOG.info("Table {} schema updated from {} to {}", identifier, tableSchema, newSchema); - return comparisonAfterMigration; - } catch (CommitFailedException e) { - cache.invalidate(identifier); - TableMetadataCache.ResolvedSchemaInfo newSchema = cache.schema(identifier, schema); - if (newSchema.compareResult() != CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED) { - LOG.debug("Table {} schema updated concurrently to {}", identifier, schema); - return newSchema; - } else { - LOG.error( - "Schema update failed for {} from {} to {}", identifier, tableSchema, schema, e); - throw e; - } - } - default: - throw new IllegalArgumentException("Unknown comparison result"); - } - } - } - - private PartitionSpec findOrCreateSpec(TableIdentifier identifier, PartitionSpec targetSpec) { - PartitionSpec currentSpec = cache.spec(identifier, targetSpec); - if (currentSpec != null) { - return currentSpec; - } - - Table table = catalog.loadTable(identifier); - currentSpec = table.spec(); - - PartitionSpecEvolution.PartitionSpecChanges result = - PartitionSpecEvolution.evolve(currentSpec, targetSpec); - if (result.isEmpty()) { - LOG.info("Returning equivalent existing spec {} for {}", currentSpec, targetSpec); - return currentSpec; - } - - LOG.info( - "Spec for table {} has been altered. 
Updating from {} to {}", - identifier, - currentSpec, - targetSpec); - UpdatePartitionSpec updater = table.updateSpec(); - result.termsToRemove().forEach(updater::removeField); - result.termsToAdd().forEach(updater::addField); - - try { - updater.commit(); - cache.update(identifier, table); - } catch (CommitFailedException e) { - cache.invalidate(identifier); - PartitionSpec newSpec = cache.spec(identifier, targetSpec); - result = PartitionSpecEvolution.evolve(targetSpec, newSpec); - if (result.isEmpty()) { - LOG.debug("Table {} partition spec updated concurrently to {}", identifier, newSpec); - return newSpec; - } else { - LOG.error( - "Partition spec update failed for {} from {} to {}", - identifier, - currentSpec, - targetSpec, - e); - throw e; - } - } - return cache.spec(identifier, targetSpec); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java deleted file mode 100644 index afd5b637e933..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/WriteTarget.java +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Objects; -import java.util.Set; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.hadoop.util.Sets; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -class WriteTarget implements Serializable { - - private final String tableName; - private final String branch; - private final Integer schemaId; - private final Integer specId; - private final boolean upsertMode; - private final Set equalityFields; - - WriteTarget( - String tableName, - String branch, - Integer schemaId, - Integer specId, - boolean upsertMode, - Set equalityFields) { - this.tableName = tableName; - this.branch = branch != null ? 
branch : "main"; - this.schemaId = schemaId; - this.specId = specId; - this.upsertMode = upsertMode; - this.equalityFields = equalityFields; - } - - String tableName() { - return tableName; - } - - String branch() { - return branch; - } - - Integer schemaId() { - return schemaId; - } - - Integer specId() { - return specId; - } - - boolean upsertMode() { - return upsertMode; - } - - Set equalityFields() { - return equalityFields; - } - - void serializeTo(DataOutputView view) throws IOException { - view.writeUTF(tableName); - view.writeUTF(branch); - view.writeInt(schemaId); - view.writeInt(specId); - view.writeBoolean(upsertMode); - view.writeInt(equalityFields.size()); - for (Integer equalityField : equalityFields) { - view.writeInt(equalityField); - } - } - - static WriteTarget deserializeFrom(DataInputView view) throws IOException { - return new WriteTarget( - view.readUTF(), - view.readUTF(), - view.readInt(), - view.readInt(), - view.readBoolean(), - readSet(view)); - } - - private static Set readSet(DataInputView view) throws IOException { - int numFields = view.readInt(); - Set equalityFields = Sets.newHashSetWithExpectedSize(numFields); - for (int i = 0; i < numFields; i++) { - equalityFields.add(view.readInt()); - } - - return equalityFields; - } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - - if (other == null || getClass() != other.getClass()) { - return false; - } - - WriteTarget that = (WriteTarget) other; - return Objects.equals(tableName, that.tableName) - && Objects.equals(branch, that.branch) - && Objects.equals(schemaId, that.schemaId) - && Objects.equals(specId, that.specId) - && upsertMode == that.upsertMode - && Objects.equals(equalityFields, that.equalityFields); - } - - @Override - public int hashCode() { - return Objects.hash(tableName, branch, schemaId, specId, upsertMode, equalityFields); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("tableName", tableName) - .add("branch", branch) - .add("schemaId", schemaId) - .add("specId", specId) - .add("upsertMode", upsertMode) - .toString(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java deleted file mode 100644 index 95c2328f032a..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Map; -import java.util.NavigableMap; -import java.util.Set; -import javax.annotation.Nullable; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.datasketches.sampling.ReservoirItemsUnion; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * AggregatedStatisticsTracker tracks the statistics aggregation received from {@link - * DataStatisticsOperator} subtasks for every checkpoint. - */ -class AggregatedStatisticsTracker { - private static final Logger LOG = LoggerFactory.getLogger(AggregatedStatisticsTracker.class); - - private final String operatorName; - private final int parallelism; - private final TypeSerializer statisticsSerializer; - private final int downstreamParallelism; - private final StatisticsType statisticsType; - private final int switchToSketchThreshold; - private final NavigableMap aggregationsPerCheckpoint; - - private CompletedStatistics completedStatistics; - - AggregatedStatisticsTracker( - String operatorName, - int parallelism, - Schema schema, - SortOrder sortOrder, - int downstreamParallelism, - StatisticsType statisticsType, - int switchToSketchThreshold, - @Nullable CompletedStatistics restoredStatistics) { - this.operatorName = operatorName; - this.parallelism = parallelism; - this.statisticsSerializer = - new DataStatisticsSerializer(new SortKeySerializer(schema, sortOrder)); - this.downstreamParallelism = downstreamParallelism; - this.statisticsType = statisticsType; - this.switchToSketchThreshold = switchToSketchThreshold; - this.completedStatistics = restoredStatistics; - - this.aggregationsPerCheckpoint = Maps.newTreeMap(); - } - - CompletedStatistics updateAndCheckCompletion(int subtask, StatisticsEvent event) { - long checkpointId = event.checkpointId(); - LOG.debug( - "Handling statistics event from subtask {} of operator {} for checkpoint {}", - subtask, - operatorName, - checkpointId); - - if (completedStatistics != null && completedStatistics.checkpointId() > checkpointId) { - LOG.info( - "Ignore stale statistics event from operator {} subtask {} for older checkpoint {}. 
" - + "Was expecting data statistics from checkpoint higher than {}", - operatorName, - subtask, - checkpointId, - completedStatistics.checkpointId()); - return null; - } - - Aggregation aggregation = - aggregationsPerCheckpoint.computeIfAbsent( - checkpointId, - ignored -> - new Aggregation( - parallelism, - downstreamParallelism, - switchToSketchThreshold, - statisticsType, - StatisticsUtil.collectType(statisticsType, completedStatistics))); - DataStatistics dataStatistics = - StatisticsUtil.deserializeDataStatistics(event.statisticsBytes(), statisticsSerializer); - if (!aggregation.merge(subtask, dataStatistics)) { - LOG.debug( - "Ignore duplicate data statistics from operator {} subtask {} for checkpoint {}.", - operatorName, - subtask, - checkpointId); - } - - if (aggregation.isComplete()) { - this.completedStatistics = aggregation.completedStatistics(checkpointId); - // clean up aggregations up to the completed checkpoint id - aggregationsPerCheckpoint.headMap(checkpointId, true).clear(); - return completedStatistics; - } - - return null; - } - - @VisibleForTesting - NavigableMap aggregationsPerCheckpoint() { - return aggregationsPerCheckpoint; - } - - static class Aggregation { - private static final Logger LOG = LoggerFactory.getLogger(Aggregation.class); - - private final Set subtaskSet; - private final int parallelism; - private final int downstreamParallelism; - private final int switchToSketchThreshold; - private final StatisticsType configuredType; - private StatisticsType currentType; - private Map mapStatistics; - private ReservoirItemsUnion sketchStatistics; - - Aggregation( - int parallelism, - int downstreamParallelism, - int switchToSketchThreshold, - StatisticsType configuredType, - StatisticsType currentType) { - this.subtaskSet = Sets.newHashSet(); - this.parallelism = parallelism; - this.downstreamParallelism = downstreamParallelism; - this.switchToSketchThreshold = switchToSketchThreshold; - this.configuredType = configuredType; - this.currentType = currentType; - - if (currentType == StatisticsType.Map) { - this.mapStatistics = Maps.newHashMap(); - this.sketchStatistics = null; - } else { - this.mapStatistics = null; - this.sketchStatistics = - ReservoirItemsUnion.newInstance( - SketchUtil.determineCoordinatorReservoirSize(downstreamParallelism)); - } - } - - @VisibleForTesting - Set subtaskSet() { - return subtaskSet; - } - - @VisibleForTesting - StatisticsType currentType() { - return currentType; - } - - @VisibleForTesting - Map mapStatistics() { - return mapStatistics; - } - - @VisibleForTesting - ReservoirItemsUnion sketchStatistics() { - return sketchStatistics; - } - - private boolean isComplete() { - return subtaskSet.size() == parallelism; - } - - /** - * @return false if duplicate - */ - private boolean merge(int subtask, DataStatistics taskStatistics) { - if (subtaskSet.contains(subtask)) { - return false; - } - - subtaskSet.add(subtask); - merge(taskStatistics); - return true; - } - - @SuppressWarnings("unchecked") - private void merge(DataStatistics taskStatistics) { - if (taskStatistics.type() == StatisticsType.Map) { - Map taskMapStats = (Map) taskStatistics.result(); - if (currentType == StatisticsType.Map) { - taskMapStats.forEach((key, count) -> mapStatistics.merge(key, count, Long::sum)); - if (configuredType == StatisticsType.Auto - && mapStatistics.size() > switchToSketchThreshold) { - convertCoordinatorToSketch(); - } - } else { - // convert task stats to sketch first - ReservoirItemsSketch taskSketch = - ReservoirItemsSketch.newInstance( - 
SketchUtil.determineOperatorReservoirSize(parallelism, downstreamParallelism)); - SketchUtil.convertMapToSketch(taskMapStats, taskSketch::update); - sketchStatistics.update(taskSketch); - } - } else { - ReservoirItemsSketch taskSketch = - (ReservoirItemsSketch) taskStatistics.result(); - if (currentType == StatisticsType.Map) { - // convert global stats to sketch first - convertCoordinatorToSketch(); - } - - if (taskSketch.getNumSamples() > 0) { - sketchStatistics.update(taskSketch); - } - } - } - - private void convertCoordinatorToSketch() { - this.sketchStatistics = - ReservoirItemsUnion.newInstance( - SketchUtil.determineCoordinatorReservoirSize(downstreamParallelism)); - SketchUtil.convertMapToSketch(mapStatistics, sketchStatistics::update); - this.currentType = StatisticsType.Sketch; - this.mapStatistics = null; - } - - private CompletedStatistics completedStatistics(long checkpointId) { - if (currentType == StatisticsType.Map) { - LOG.info("Completed map statistics aggregation with {} keys", mapStatistics.size()); - return CompletedStatistics.fromKeyFrequency(checkpointId, mapStatistics); - } else { - ReservoirItemsSketch sketch = sketchStatistics.getResult(); - if (sketch != null) { - LOG.info( - "Completed sketch statistics aggregation: " - + "reservoir size = {}, number of items seen = {}, number of samples = {}", - sketch.getK(), - sketch.getN(), - sketch.getNumSamples()); - return CompletedStatistics.fromKeySamples(checkpointId, sketch.getSamples()); - } else { - LOG.info("Empty sketch statistics."); - return CompletedStatistics.fromKeySamples(checkpointId, new SortKey[0]); - } - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java deleted file mode 100644 index a8bf0f839e49..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Arrays; -import java.util.Map; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; - -/** - * This is what {@link AggregatedStatisticsTracker} returns upon a completed statistics aggregation - * from all subtasks. It contains the raw statistics (Map or reservoir samples). 
- */ -class CompletedStatistics { - private final long checkpointId; - private final StatisticsType type; - private final Map keyFrequency; - private final SortKey[] keySamples; - - static CompletedStatistics fromKeyFrequency(long checkpointId, Map stats) { - return new CompletedStatistics(checkpointId, StatisticsType.Map, stats, null); - } - - static CompletedStatistics fromKeySamples(long checkpointId, SortKey[] keySamples) { - return new CompletedStatistics(checkpointId, StatisticsType.Sketch, null, keySamples); - } - - CompletedStatistics( - long checkpointId, - StatisticsType type, - Map keyFrequency, - SortKey[] keySamples) { - this.checkpointId = checkpointId; - this.type = type; - this.keyFrequency = keyFrequency; - this.keySamples = keySamples; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("checkpointId", checkpointId) - .add("type", type) - .add("keyFrequency", keyFrequency) - .add("keySamples", keySamples) - .toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (!(o instanceof CompletedStatistics)) { - return false; - } - - CompletedStatistics other = (CompletedStatistics) o; - return Objects.equal(checkpointId, other.checkpointId) - && Objects.equal(type, other.type) - && Objects.equal(keyFrequency, other.keyFrequency()) - && Arrays.equals(keySamples, other.keySamples()); - } - - @Override - public int hashCode() { - return Objects.hashCode(checkpointId, type, keyFrequency, keySamples); - } - - long checkpointId() { - return checkpointId; - } - - StatisticsType type() { - return type; - } - - Map keyFrequency() { - return keyFrequency; - } - - SortKey[] keySamples() { - return keySamples; - } - - boolean isEmpty() { - if (type == StatisticsType.Sketch) { - return keySamples.length == 0; - } else { - return keyFrequency().isEmpty(); - } - } - - boolean isValid() { - if (type == StatisticsType.Sketch) { - if (null == keySamples) { - return false; - } - } else { - if (null == keyFrequency()) { - return false; - } - if (keyFrequency().values().contains(null)) { - return false; - } - } - - return true; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java deleted file mode 100644 index 48c85a9bd91e..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.base.EnumSerializer; -import org.apache.flink.api.common.typeutils.base.ListSerializer; -import org.apache.flink.api.common.typeutils.base.LongSerializer; -import org.apache.flink.api.common.typeutils.base.MapSerializer; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.iceberg.SortKey; - -class CompletedStatisticsSerializer extends TypeSerializer { - private final TypeSerializer sortKeySerializer; - private final EnumSerializer statisticsTypeSerializer; - private final MapSerializer keyFrequencySerializer; - private final ListSerializer keySamplesSerializer; - - CompletedStatisticsSerializer(TypeSerializer sortKeySerializer) { - this.sortKeySerializer = sortKeySerializer; - this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); - this.keyFrequencySerializer = new MapSerializer<>(sortKeySerializer, LongSerializer.INSTANCE); - this.keySamplesSerializer = new ListSerializer<>(sortKeySerializer); - } - - public void changeSortKeySerializerVersion(int version) { - if (sortKeySerializer instanceof SortKeySerializer) { - ((SortKeySerializer) sortKeySerializer).setVersion(version); - } - } - - public void changeSortKeySerializerVersionLatest() { - if (sortKeySerializer instanceof SortKeySerializer) { - ((SortKeySerializer) sortKeySerializer).restoreToLatestVersion(); - } - } - - @Override - public boolean isImmutableType() { - return false; - } - - @Override - public TypeSerializer duplicate() { - return new CompletedStatisticsSerializer(sortKeySerializer); - } - - @Override - public CompletedStatistics createInstance() { - return CompletedStatistics.fromKeyFrequency(0L, Collections.emptyMap()); - } - - @Override - public CompletedStatistics copy(CompletedStatistics from) { - return new CompletedStatistics( - from.checkpointId(), from.type(), from.keyFrequency(), from.keySamples()); - } - - @Override - public CompletedStatistics copy(CompletedStatistics from, CompletedStatistics reuse) { - // no benefit of reuse - return copy(from); - } - - @Override - public int getLength() { - return -1; - } - - @Override - public void serialize(CompletedStatistics record, DataOutputView target) throws IOException { - target.writeLong(record.checkpointId()); - statisticsTypeSerializer.serialize(record.type(), target); - if (record.type() == StatisticsType.Map) { - keyFrequencySerializer.serialize(record.keyFrequency(), target); - } else { - keySamplesSerializer.serialize(Arrays.asList(record.keySamples()), target); - } - } - - @Override - public CompletedStatistics deserialize(DataInputView source) throws IOException { - long checkpointId = source.readLong(); - StatisticsType type = statisticsTypeSerializer.deserialize(source); - if (type == StatisticsType.Map) { - Map keyFrequency = keyFrequencySerializer.deserialize(source); - return CompletedStatistics.fromKeyFrequency(checkpointId, keyFrequency); - } else { - List sortKeys = keySamplesSerializer.deserialize(source); - SortKey[] keySamples = new SortKey[sortKeys.size()]; - keySamples = 
sortKeys.toArray(keySamples); - return CompletedStatistics.fromKeySamples(checkpointId, keySamples); - } - } - - @Override - public CompletedStatistics deserialize(CompletedStatistics reuse, DataInputView source) - throws IOException { - // not much benefit to reuse - return deserialize(source); - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws IOException { - serialize(deserialize(source), target); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - - if (obj == null || getClass() != obj.getClass()) { - return false; - } - - CompletedStatisticsSerializer other = (CompletedStatisticsSerializer) obj; - return Objects.equals(sortKeySerializer, other.sortKeySerializer); - } - - @Override - public int hashCode() { - return sortKeySerializer.hashCode(); - } - - @Override - public TypeSerializerSnapshot snapshotConfiguration() { - return new CompletedStatisticsSerializerSnapshot(this); - } - - public static class CompletedStatisticsSerializerSnapshot - extends CompositeTypeSerializerSnapshot { - private static final int CURRENT_VERSION = 1; - - /** Constructor for read instantiation. */ - @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) - public CompletedStatisticsSerializerSnapshot() {} - - @SuppressWarnings("checkstyle:RedundantModifier") - public CompletedStatisticsSerializerSnapshot(CompletedStatisticsSerializer serializer) { - super(serializer); - } - - @Override - protected int getCurrentOuterSnapshotVersion() { - return CURRENT_VERSION; - } - - @Override - protected TypeSerializer[] getNestedSerializers( - CompletedStatisticsSerializer outerSerializer) { - return new TypeSerializer[] {outerSerializer.sortKeySerializer}; - } - - @Override - protected CompletedStatisticsSerializer createOuterSerializerWithNestedSerializers( - TypeSerializer[] nestedSerializers) { - SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; - return new CompletedStatisticsSerializer(sortKeySerializer); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java deleted file mode 100644 index 76c59cd5f4b8..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Map; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.SortKey; - -/** - * DataStatistics defines the interface to collect data distribution information. 
- * - *
    Data statistics tracks traffic volume distribution across data keys. For low-cardinality key, - * a simple map of (key, count) can be used. For high-cardinality key, probabilistic data structures - * (sketching) can be used. - */ -@Internal -interface DataStatistics { - - StatisticsType type(); - - boolean isEmpty(); - - /** Add row sortKey to data statistics. */ - void add(SortKey sortKey); - - /** - * Get the collected statistics. Could be a {@link Map} (low cardinality) or {@link - * ReservoirItemsSketch} (high cardinality) - */ - Object result(); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java deleted file mode 100644 index 773d0fe6c65a..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java +++ /dev/null @@ -1,536 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Comparator; -import java.util.Locale; -import java.util.Map; -import java.util.concurrent.Callable; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.ThreadFactory; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; -import org.apache.flink.runtime.operators.coordination.OperatorEvent; -import org.apache.flink.util.ExceptionUtils; -import org.apache.flink.util.FatalExitExceptionHandler; -import org.apache.flink.util.FlinkRuntimeException; -import org.apache.flink.util.ThrowableCatchingRunnable; -import org.apache.flink.util.function.ThrowingRunnable; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Comparators; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * DataStatisticsCoordinator receives {@link StatisticsEvent} from {@link DataStatisticsOperator} - * every subtask and then merge them together. 
Once aggregation for all subtasks data statistics - * completes, DataStatisticsCoordinator will send the aggregated data statistics back to {@link - * DataStatisticsOperator}. In the end a custom partitioner will distribute traffic based on the - * aggregated data statistics to improve data clustering. - */ -@Internal -class DataStatisticsCoordinator implements OperatorCoordinator { - private static final Logger LOG = LoggerFactory.getLogger(DataStatisticsCoordinator.class); - - private final String operatorName; - private final OperatorCoordinator.Context context; - private final Schema schema; - private final SortOrder sortOrder; - private final Comparator comparator; - private final int downstreamParallelism; - private final StatisticsType statisticsType; - private final double closeFileCostWeightPercentage; - - private final ExecutorService coordinatorExecutor; - private final SubtaskGateways subtaskGateways; - private final CoordinatorExecutorThreadFactory coordinatorThreadFactory; - private final TypeSerializer completedStatisticsSerializer; - private final TypeSerializer globalStatisticsSerializer; - - private transient boolean started; - private transient AggregatedStatisticsTracker aggregatedStatisticsTracker; - private transient CompletedStatistics completedStatistics; - private transient GlobalStatistics globalStatistics; - - DataStatisticsCoordinator( - String operatorName, - OperatorCoordinator.Context context, - Schema schema, - SortOrder sortOrder, - int downstreamParallelism, - StatisticsType statisticsType, - double closeFileCostWeightPercentage) { - this.operatorName = operatorName; - this.context = context; - this.schema = schema; - this.sortOrder = sortOrder; - this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); - this.downstreamParallelism = downstreamParallelism; - this.statisticsType = statisticsType; - this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; - - this.coordinatorThreadFactory = - new CoordinatorExecutorThreadFactory( - "DataStatisticsCoordinator-" + operatorName, context.getUserCodeClassloader()); - this.coordinatorExecutor = Executors.newSingleThreadExecutor(coordinatorThreadFactory); - this.subtaskGateways = new SubtaskGateways(operatorName, context.currentParallelism()); - SortKeySerializer sortKeySerializer = new SortKeySerializer(schema, sortOrder); - this.completedStatisticsSerializer = new CompletedStatisticsSerializer(sortKeySerializer); - this.globalStatisticsSerializer = new GlobalStatisticsSerializer(sortKeySerializer); - } - - @Override - public void start() throws Exception { - LOG.info("Starting data statistics coordinator: {}.", operatorName); - this.started = true; - - // statistics are restored already in resetToCheckpoint() before start() called - this.aggregatedStatisticsTracker = - new AggregatedStatisticsTracker( - operatorName, - context.currentParallelism(), - schema, - sortOrder, - downstreamParallelism, - statisticsType, - SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, - completedStatistics); - } - - @Override - public void close() throws Exception { - coordinatorExecutor.shutdown(); - this.aggregatedStatisticsTracker = null; - this.started = false; - LOG.info("Closed data statistics coordinator: {}.", operatorName); - } - - @VisibleForTesting - void callInCoordinatorThread(Callable callable, String errorMessage) { - ensureStarted(); - // Ensure the task is done by the coordinator executor. 
- if (!coordinatorThreadFactory.isCurrentThreadCoordinatorThread()) { - try { - Callable guardedCallable = - () -> { - try { - return callable.call(); - } catch (Throwable t) { - LOG.error( - "Uncaught Exception in data statistics coordinator: {} executor", - operatorName, - t); - ExceptionUtils.rethrowException(t); - return null; - } - }; - - coordinatorExecutor.submit(guardedCallable).get(); - } catch (InterruptedException | ExecutionException e) { - throw new FlinkRuntimeException(errorMessage, e); - } - } else { - try { - callable.call(); - } catch (Throwable t) { - LOG.error( - "Uncaught Exception in data statistics coordinator: {} executor", operatorName, t); - throw new FlinkRuntimeException(errorMessage, t); - } - } - } - - public void runInCoordinatorThread(Runnable runnable) { - this.coordinatorExecutor.execute( - new ThrowableCatchingRunnable( - throwable -> - this.coordinatorThreadFactory.uncaughtException(Thread.currentThread(), throwable), - runnable)); - } - - private void runInCoordinatorThread(ThrowingRunnable action, String actionString) { - ensureStarted(); - runInCoordinatorThread( - () -> { - try { - action.run(); - } catch (Throwable t) { - ExceptionUtils.rethrowIfFatalErrorOrOOM(t); - LOG.error( - "Uncaught exception in the data statistics coordinator: {} while {}. Triggering job failover", - operatorName, - actionString, - t); - context.failJob(t); - } - }); - } - - private void ensureStarted() { - Preconditions.checkState(started, "The coordinator of %s has not started yet.", operatorName); - } - - private void handleDataStatisticRequest(int subtask, StatisticsEvent event) { - CompletedStatistics maybeCompletedStatistics = - aggregatedStatisticsTracker.updateAndCheckCompletion(subtask, event); - - if (maybeCompletedStatistics != null) { - if (maybeCompletedStatistics.isEmpty()) { - LOG.info( - "Skip aggregated statistics for checkpoint {} as it is empty.", event.checkpointId()); - } else { - LOG.info("Completed statistics aggregation for checkpoint {}", event.checkpointId()); - // completedStatistics contains the complete samples, which is needed to compute - // the range bounds in globalStatistics if downstreamParallelism changed. - this.completedStatistics = maybeCompletedStatistics; - // globalStatistics only contains assignment calculated based on Map or Sketch statistics - this.globalStatistics = - globalStatistics( - maybeCompletedStatistics, - downstreamParallelism, - comparator, - closeFileCostWeightPercentage); - sendGlobalStatisticsToSubtasks(globalStatistics); - } - } - } - - private static GlobalStatistics globalStatistics( - CompletedStatistics completedStatistics, - int downstreamParallelism, - Comparator comparator, - double closeFileCostWeightPercentage) { - if (completedStatistics.type() == StatisticsType.Sketch) { - // range bound is a much smaller array compared to the complete samples. - // It helps reduce the amount of data transfer from coordinator to operator subtasks. 
- return GlobalStatistics.fromRangeBounds( - completedStatistics.checkpointId(), - SketchUtil.rangeBounds( - downstreamParallelism, comparator, completedStatistics.keySamples())); - } else { - return GlobalStatistics.fromMapAssignment( - completedStatistics.checkpointId(), - MapAssignment.fromKeyFrequency( - downstreamParallelism, - completedStatistics.keyFrequency(), - closeFileCostWeightPercentage, - comparator)); - } - } - - @SuppressWarnings("FutureReturnValueIgnored") - private void sendGlobalStatisticsToSubtasks(GlobalStatistics statistics) { - runInCoordinatorThread( - () -> { - LOG.info( - "Broadcast latest global statistics from checkpoint {} to all subtasks", - statistics.checkpointId()); - // applyImmediately is set to false so that operator subtasks can - // apply the change at checkpoint boundary - StatisticsEvent statisticsEvent = - StatisticsEvent.createGlobalStatisticsEvent( - statistics, globalStatisticsSerializer, false); - for (int i = 0; i < context.currentParallelism(); ++i) { - // Ignore future return value for potential error (e.g. subtask down). - // Upon restart, subtasks send request to coordinator to refresh statistics - // if there is any difference - subtaskGateways.getSubtaskGateway(i).sendEvent(statisticsEvent); - } - }, - String.format( - Locale.ROOT, - "Failed to send operator %s coordinator global data statistics for checkpoint %d", - operatorName, - statistics.checkpointId())); - } - - @SuppressWarnings("FutureReturnValueIgnored") - private void handleRequestGlobalStatisticsEvent(int subtask, RequestGlobalStatisticsEvent event) { - if (globalStatistics != null) { - runInCoordinatorThread( - () -> { - if (event.signature() != null && event.signature() == globalStatistics.hashCode()) { - LOG.debug( - "Skip responding to statistics request from subtask {}, as the operator task already holds the same global statistics", - subtask); - } else { - LOG.info( - "Send latest global statistics from checkpoint {} to subtask {}", - globalStatistics.checkpointId(), - subtask); - StatisticsEvent statisticsEvent = - StatisticsEvent.createGlobalStatisticsEvent( - globalStatistics, globalStatisticsSerializer, true); - subtaskGateways.getSubtaskGateway(subtask).sendEvent(statisticsEvent); - } - }, - String.format( - Locale.ROOT, - "Failed to send operator %s coordinator global data statistics to requesting subtask %d for checkpoint %d", - operatorName, - subtask, - globalStatistics.checkpointId())); - } else { - LOG.info( - "Ignore global statistics request from subtask {} as statistics not available", subtask); - } - } - - @Override - public void handleEventFromOperator(int subtask, int attemptNumber, OperatorEvent event) { - runInCoordinatorThread( - () -> { - LOG.debug( - "Handling event from subtask {} (#{}) of {}: {}", - subtask, - attemptNumber, - operatorName, - event); - if (event instanceof StatisticsEvent) { - handleDataStatisticRequest(subtask, ((StatisticsEvent) event)); - } else if (event instanceof RequestGlobalStatisticsEvent) { - handleRequestGlobalStatisticsEvent(subtask, (RequestGlobalStatisticsEvent) event); - } else { - throw new IllegalArgumentException( - "Invalid operator event type: " + event.getClass().getCanonicalName()); - } - }, - String.format( - Locale.ROOT, - "handling operator event %s from subtask %d (#%d)", - event.getClass(), - subtask, - attemptNumber)); - } - - @Override - public void checkpointCoordinator(long checkpointId, CompletableFuture resultFuture) { - runInCoordinatorThread( - () -> { - LOG.debug( - "Snapshotting data statistics 
coordinator {} for checkpoint {}", - operatorName, - checkpointId); - if (completedStatistics == null) { - // null checkpoint result is not allowed, hence supply an empty byte array - resultFuture.complete(new byte[0]); - } else { - resultFuture.complete( - StatisticsUtil.serializeCompletedStatistics( - completedStatistics, completedStatisticsSerializer)); - } - }, - String.format(Locale.ROOT, "taking checkpoint %d", checkpointId)); - } - - @Override - public void notifyCheckpointComplete(long checkpointId) {} - - @Override - public void resetToCheckpoint(long checkpointId, byte[] checkpointData) { - Preconditions.checkState( - !started, "The coordinator %s can only be reset if it was not yet started", operatorName); - if (checkpointData == null || checkpointData.length == 0) { - LOG.info( - "Data statistic coordinator {} has nothing to restore from checkpoint {}", - operatorName, - checkpointId); - return; - } - - LOG.info( - "Restoring data statistic coordinator {} from checkpoint {}", operatorName, checkpointId); - this.completedStatistics = - StatisticsUtil.deserializeCompletedStatistics( - checkpointData, (CompletedStatisticsSerializer) completedStatisticsSerializer); - - // recompute global statistics in case downstream parallelism changed - this.globalStatistics = - globalStatistics( - completedStatistics, downstreamParallelism, comparator, closeFileCostWeightPercentage); - } - - @Override - public void subtaskReset(int subtask, long checkpointId) { - runInCoordinatorThread( - () -> { - LOG.info( - "Operator {} subtask {} is reset to checkpoint {}", - operatorName, - subtask, - checkpointId); - Preconditions.checkState( - this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); - subtaskGateways.reset(subtask); - }, - String.format( - Locale.ROOT, "handling subtask %d recovery to checkpoint %d", subtask, checkpointId)); - } - - @Override - public void executionAttemptFailed(int subtask, int attemptNumber, @Nullable Throwable reason) { - runInCoordinatorThread( - () -> { - LOG.info( - "Unregistering gateway after failure for subtask {} (#{}) of data statistics {}", - subtask, - attemptNumber, - operatorName); - Preconditions.checkState( - this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); - subtaskGateways.unregisterSubtaskGateway(subtask, attemptNumber); - }, - String.format(Locale.ROOT, "handling subtask %d (#%d) failure", subtask, attemptNumber)); - } - - @Override - public void executionAttemptReady(int subtask, int attemptNumber, SubtaskGateway gateway) { - Preconditions.checkArgument(subtask == gateway.getSubtask()); - Preconditions.checkArgument(attemptNumber == gateway.getExecution().getAttemptNumber()); - runInCoordinatorThread( - () -> { - Preconditions.checkState( - this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); - subtaskGateways.registerSubtaskGateway(gateway); - }, - String.format( - Locale.ROOT, - "making event gateway to subtask %d (#%d) available", - subtask, - attemptNumber)); - } - - @VisibleForTesting - CompletedStatistics completedStatistics() { - return completedStatistics; - } - - @VisibleForTesting - GlobalStatistics globalStatistics() { - return globalStatistics; - } - - private static class SubtaskGateways { - private final String operatorName; - private final Map[] gateways; - - @SuppressWarnings("unchecked") - private SubtaskGateways(String operatorName, int parallelism) { - this.operatorName = operatorName; - gateways = new Map[parallelism]; - - for (int i = 0; i < parallelism; ++i) { - gateways[i] = 
Maps.newHashMap(); - } - } - - private void registerSubtaskGateway(OperatorCoordinator.SubtaskGateway gateway) { - int subtaskIndex = gateway.getSubtask(); - int attemptNumber = gateway.getExecution().getAttemptNumber(); - Preconditions.checkState( - !gateways[subtaskIndex].containsKey(attemptNumber), - "Coordinator of %s already has a subtask gateway for %d (#%d)", - operatorName, - subtaskIndex, - attemptNumber); - LOG.debug( - "Coordinator of {} registers gateway for subtask {} attempt {}", - operatorName, - subtaskIndex, - attemptNumber); - gateways[subtaskIndex].put(attemptNumber, gateway); - } - - private void unregisterSubtaskGateway(int subtaskIndex, int attemptNumber) { - LOG.debug( - "Coordinator of {} unregisters gateway for subtask {} attempt {}", - operatorName, - subtaskIndex, - attemptNumber); - gateways[subtaskIndex].remove(attemptNumber); - } - - private OperatorCoordinator.SubtaskGateway getSubtaskGateway(int subtaskIndex) { - Preconditions.checkState( - !gateways[subtaskIndex].isEmpty(), - "Coordinator of %s subtask %d is not ready yet to receive events", - operatorName, - subtaskIndex); - return Iterables.getOnlyElement(gateways[subtaskIndex].values()); - } - - private void reset(int subtaskIndex) { - gateways[subtaskIndex].clear(); - } - } - - private static class CoordinatorExecutorThreadFactory - implements ThreadFactory, Thread.UncaughtExceptionHandler { - - private final String coordinatorThreadName; - private final ClassLoader classLoader; - private final Thread.UncaughtExceptionHandler errorHandler; - - @javax.annotation.Nullable private Thread thread; - - CoordinatorExecutorThreadFactory( - final String coordinatorThreadName, final ClassLoader contextClassLoader) { - this(coordinatorThreadName, contextClassLoader, FatalExitExceptionHandler.INSTANCE); - } - - @org.apache.flink.annotation.VisibleForTesting - CoordinatorExecutorThreadFactory( - final String coordinatorThreadName, - final ClassLoader contextClassLoader, - final Thread.UncaughtExceptionHandler errorHandler) { - this.coordinatorThreadName = coordinatorThreadName; - this.classLoader = contextClassLoader; - this.errorHandler = errorHandler; - } - - @Override - public synchronized Thread newThread(@NotNull Runnable runnable) { - thread = new Thread(runnable, coordinatorThreadName); - thread.setContextClassLoader(classLoader); - thread.setUncaughtExceptionHandler(this); - return thread; - } - - @Override - public synchronized void uncaughtException(Thread t, Throwable e) { - errorHandler.uncaughtException(t, e); - } - - boolean isCurrentThreadCoordinatorThread() { - return Thread.currentThread() == thread; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java deleted file mode 100644 index 9d7d989c298e..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; -import org.apache.flink.runtime.operators.coordination.RecreateOnResetOperatorCoordinator; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; - -/** - * DataStatisticsCoordinatorProvider provides the method to create new {@link - * DataStatisticsCoordinator} - */ -@Internal -public class DataStatisticsCoordinatorProvider extends RecreateOnResetOperatorCoordinator.Provider { - - private final String operatorName; - private final Schema schema; - private final SortOrder sortOrder; - private final int downstreamParallelism; - private final StatisticsType type; - private final double closeFileCostWeightPercentage; - - public DataStatisticsCoordinatorProvider( - String operatorName, - OperatorID operatorID, - Schema schema, - SortOrder sortOrder, - int downstreamParallelism, - StatisticsType type, - double closeFileCostWeightPercentage) { - super(operatorID); - this.operatorName = operatorName; - this.schema = schema; - this.sortOrder = sortOrder; - this.downstreamParallelism = downstreamParallelism; - this.type = type; - this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; - } - - @Override - public OperatorCoordinator getCoordinator(OperatorCoordinator.Context context) { - return new DataStatisticsCoordinator( - operatorName, - context, - schema, - sortOrder, - downstreamParallelism, - type, - closeFileCostWeightPercentage); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java deleted file mode 100644 index 7995a8a5b181..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Map; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.runtime.operators.coordination.OperatorEvent; -import org.apache.flink.runtime.operators.coordination.OperatorEventGateway; -import org.apache.flink.runtime.operators.coordination.OperatorEventHandler; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * DataStatisticsOperator collects traffic distribution statistics. A custom partitioner shall be - * attached to the DataStatisticsOperator output. The custom partitioner leverages the statistics to - * shuffle record to improve data clustering while maintaining relative balanced traffic - * distribution to downstream subtasks. - */ -@Internal -public class DataStatisticsOperator extends AbstractStreamOperator - implements OneInputStreamOperator, OperatorEventHandler { - - private static final long serialVersionUID = 1L; - - private final String operatorName; - private final RowDataWrapper rowDataWrapper; - private final SortKey sortKey; - private final OperatorEventGateway operatorEventGateway; - private final int downstreamParallelism; - private final StatisticsType statisticsType; - private final TypeSerializer taskStatisticsSerializer; - private final TypeSerializer globalStatisticsSerializer; - - private transient int parallelism; - private transient int subtaskIndex; - private transient ListState globalStatisticsState; - // current statistics type may be different from the config due to possible - // migration from Map statistics to Sketch statistics when high cardinality detected - private transient volatile StatisticsType taskStatisticsType; - private transient volatile DataStatistics localStatistics; - private transient volatile GlobalStatistics globalStatistics; - - DataStatisticsOperator( - String operatorName, - Schema schema, - SortOrder sortOrder, - OperatorEventGateway operatorEventGateway, - int downstreamParallelism, - StatisticsType statisticsType) { - this.operatorName = operatorName; - this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - this.sortKey = new SortKey(schema, sortOrder); - this.operatorEventGateway = operatorEventGateway; - this.downstreamParallelism = downstreamParallelism; - this.statisticsType = statisticsType; - - SortKeySerializer sortKeySerializer = new SortKeySerializer(schema, sortOrder); - this.taskStatisticsSerializer = new DataStatisticsSerializer(sortKeySerializer); - this.globalStatisticsSerializer = new GlobalStatisticsSerializer(sortKeySerializer); - } - - @Override - public void 
initializeState(StateInitializationContext context) throws Exception { - this.parallelism = getRuntimeContext().getTaskInfo().getNumberOfParallelSubtasks(); - this.subtaskIndex = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); - - // Use union state so that new subtasks can also restore global statistics during scale-up. - this.globalStatisticsState = - context - .getOperatorStateStore() - .getUnionListState( - new ListStateDescriptor<>("globalStatisticsState", globalStatisticsSerializer)); - - if (context.isRestored()) { - if (globalStatisticsState.get() == null - || !globalStatisticsState.get().iterator().hasNext()) { - LOG.info( - "Operator {} subtask {} doesn't have global statistics state to restore", - operatorName, - subtaskIndex); - // If Flink deprecates union state in the future, RequestGlobalStatisticsEvent can be - // leveraged to request global statistics from coordinator if new subtasks (scale-up case) - // has nothing to restore from. - } else { - GlobalStatistics restoredStatistics = globalStatisticsState.get().iterator().next(); - LOG.info( - "Operator {} subtask {} restored global statistics state", operatorName, subtaskIndex); - this.globalStatistics = restoredStatistics; - } - - // Always request for new statistics from coordinator upon task initialization. - // There are a few scenarios this is needed - // 1. downstream writer parallelism changed due to rescale. - // 2. coordinator failed to send the aggregated statistics to subtask - // (e.g. due to subtask failure at the time). - // Records may flow before coordinator can respond. Range partitioner should be - // able to continue to operate with potentially suboptimal behavior (in sketch case). - LOG.info( - "Operator {} subtask {} requests new global statistics from coordinator ", - operatorName, - subtaskIndex); - // coordinator can use the hashCode (if available) in the request event to determine - // if operator already has the latest global statistics and respond can be skipped. - // This makes the handling cheap in most situations. - RequestGlobalStatisticsEvent event = - globalStatistics != null - ? 
new RequestGlobalStatisticsEvent(globalStatistics.hashCode()) - : new RequestGlobalStatisticsEvent(); - operatorEventGateway.sendEventToCoordinator(event); - } - - this.taskStatisticsType = StatisticsUtil.collectType(statisticsType, globalStatistics); - this.localStatistics = - StatisticsUtil.createTaskStatistics(taskStatisticsType, parallelism, downstreamParallelism); - } - - @Override - public void open() throws Exception { - if (globalStatistics != null) { - output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); - } - } - - @Override - public void handleOperatorEvent(OperatorEvent event) { - Preconditions.checkArgument( - event instanceof StatisticsEvent, - String.format( - "Operator %s subtask %s received unexpected operator event %s", - operatorName, subtaskIndex, event.getClass())); - StatisticsEvent statisticsEvent = (StatisticsEvent) event; - LOG.info( - "Operator {} subtask {} received global data event from coordinator checkpoint {}", - operatorName, - subtaskIndex, - statisticsEvent.checkpointId()); - this.globalStatistics = - StatisticsUtil.deserializeGlobalStatistics( - statisticsEvent.statisticsBytes(), globalStatisticsSerializer); - checkStatisticsTypeMigration(); - // if applyImmediately not set, wait until the checkpoint time to switch - if (statisticsEvent.applyImmediately()) { - output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); - } - } - - @Override - public void processElement(StreamRecord streamRecord) { - // collect data statistics - RowData record = streamRecord.getValue(); - StructLike struct = rowDataWrapper.wrap(record); - sortKey.wrap(struct); - localStatistics.add(sortKey); - - checkStatisticsTypeMigration(); - output.collect(new StreamRecord<>(StatisticsOrRecord.fromRecord(record))); - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - long checkpointId = context.getCheckpointId(); - LOG.info( - "Operator {} subtask {} snapshotting data statistics for checkpoint {}", - operatorName, - subtaskIndex, - checkpointId); - - // Pass global statistics to partitioner so that all the operators refresh statistics - // at same checkpoint barrier - if (globalStatistics != null) { - output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); - } - - // Only subtask 0 saves the state so that globalStatisticsState(UnionListState) stores - // an exact copy of globalStatistics - if (globalStatistics != null - && getRuntimeContext().getTaskInfo().getIndexOfThisSubtask() == 0) { - globalStatisticsState.clear(); - LOG.info( - "Operator {} subtask {} saving global statistics to state", operatorName, subtaskIndex); - globalStatisticsState.add(globalStatistics); - LOG.debug( - "Operator {} subtask {} saved global statistics to state: {}", - operatorName, - subtaskIndex, - globalStatistics); - } - - // For now, local statistics are sent to coordinator at checkpoint - LOG.info( - "Operator {} Subtask {} sending local statistics to coordinator for checkpoint {}", - operatorName, - subtaskIndex, - checkpointId); - operatorEventGateway.sendEventToCoordinator( - StatisticsEvent.createTaskStatisticsEvent( - checkpointId, localStatistics, taskStatisticsSerializer)); - - // Recreate the local statistics - localStatistics = - StatisticsUtil.createTaskStatistics(taskStatisticsType, parallelism, downstreamParallelism); - } - - @SuppressWarnings("unchecked") - private void checkStatisticsTypeMigration() { - // only check if the statisticsType config is 
Auto and localStatistics is currently Map type - if (statisticsType == StatisticsType.Auto && localStatistics.type() == StatisticsType.Map) { - Map mapStatistics = (Map) localStatistics.result(); - // convert if local statistics has cardinality over the threshold or - // if received global statistics is already sketch type - if (mapStatistics.size() > SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD - || (globalStatistics != null && globalStatistics.type() == StatisticsType.Sketch)) { - LOG.info( - "Operator {} subtask {} switched local statistics from Map to Sketch.", - operatorName, - subtaskIndex); - this.taskStatisticsType = StatisticsType.Sketch; - this.localStatistics = - StatisticsUtil.createTaskStatistics( - taskStatisticsType, parallelism, downstreamParallelism); - SketchUtil.convertMapToSketch(mapStatistics, localStatistics::add); - } - } - } - - @VisibleForTesting - DataStatistics localStatistics() { - return localStatistics; - } - - @VisibleForTesting - GlobalStatistics globalStatistics() { - return globalStatistics; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java deleted file mode 100644 index dc147bf36d13..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperatorFactory.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; -import org.apache.flink.runtime.operators.coordination.OperatorEventGateway; -import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.CoordinatedOperatorFactory; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.StreamOperator; -import org.apache.flink.streaming.api.operators.StreamOperatorParameters; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; - -@Internal -public class DataStatisticsOperatorFactory extends AbstractStreamOperatorFactory - implements CoordinatedOperatorFactory, - OneInputStreamOperatorFactory { - - private final Schema schema; - private final SortOrder sortOrder; - private final int downstreamParallelism; - private final StatisticsType type; - private final double closeFileCostWeightPercentage; - - public DataStatisticsOperatorFactory( - Schema schema, - SortOrder sortOrder, - int downstreamParallelism, - StatisticsType type, - double closeFileCostWeightPercentage) { - this.schema = schema; - this.sortOrder = sortOrder; - this.downstreamParallelism = downstreamParallelism; - this.type = type; - this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; - } - - @Override - public OperatorCoordinator.Provider getCoordinatorProvider( - String operatorName, OperatorID operatorID) { - return new DataStatisticsCoordinatorProvider( - operatorName, - operatorID, - schema, - sortOrder, - downstreamParallelism, - type, - closeFileCostWeightPercentage); - } - - @SuppressWarnings("unchecked") - @Override - public > T createStreamOperator( - StreamOperatorParameters parameters) { - OperatorID operatorId = parameters.getStreamConfig().getOperatorID(); - String operatorName = parameters.getStreamConfig().getOperatorName(); - OperatorEventGateway gateway = - parameters.getOperatorEventDispatcher().getOperatorEventGateway(operatorId); - - DataStatisticsOperator rangeStatisticsOperator = - new DataStatisticsOperator( - operatorName, schema, sortOrder, gateway, downstreamParallelism, type); - - rangeStatisticsOperator.setup( - parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); - parameters - .getOperatorEventDispatcher() - .registerEventHandler(operatorId, rangeStatisticsOperator); - - return (T) rangeStatisticsOperator; - } - - @SuppressWarnings("rawtypes") - @Override - public Class getStreamOperatorClass(ClassLoader classLoader) { - return DataStatisticsOperator.class; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java deleted file mode 100644 index 8ce99073836d..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.util.Map; -import java.util.Objects; -import org.apache.datasketches.memory.Memory; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.base.EnumSerializer; -import org.apache.flink.api.common.typeutils.base.LongSerializer; -import org.apache.flink.api.common.typeutils.base.MapSerializer; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -@Internal -class DataStatisticsSerializer extends TypeSerializer { - private final TypeSerializer sortKeySerializer; - private final EnumSerializer statisticsTypeSerializer; - private final MapSerializer mapSerializer; - private final SortKeySketchSerializer sketchSerializer; - - DataStatisticsSerializer(TypeSerializer sortKeySerializer) { - this.sortKeySerializer = sortKeySerializer; - this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); - this.mapSerializer = new MapSerializer<>(sortKeySerializer, LongSerializer.INSTANCE); - this.sketchSerializer = new SortKeySketchSerializer(sortKeySerializer); - } - - @Override - public boolean isImmutableType() { - return false; - } - - @SuppressWarnings("ReferenceEquality") - @Override - public TypeSerializer duplicate() { - TypeSerializer duplicateSortKeySerializer = sortKeySerializer.duplicate(); - return (duplicateSortKeySerializer == sortKeySerializer) - ? this - : new DataStatisticsSerializer(duplicateSortKeySerializer); - } - - @Override - public DataStatistics createInstance() { - return new MapDataStatistics(); - } - - @SuppressWarnings("unchecked") - @Override - public DataStatistics copy(DataStatistics obj) { - StatisticsType statisticsType = obj.type(); - if (statisticsType == StatisticsType.Map) { - MapDataStatistics from = (MapDataStatistics) obj; - Map fromStats = (Map) from.result(); - Map toStats = Maps.newHashMap(fromStats); - return new MapDataStatistics(toStats); - } else if (statisticsType == StatisticsType.Sketch) { - // because ReservoirItemsSketch doesn't expose enough public methods for cloning, - // this implementation adopted the less efficient serialization and deserialization. 
- SketchDataStatistics from = (SketchDataStatistics) obj; - ReservoirItemsSketch fromStats = (ReservoirItemsSketch) from.result(); - byte[] bytes = fromStats.toByteArray(sketchSerializer); - Memory memory = Memory.wrap(bytes); - ReservoirItemsSketch toStats = - ReservoirItemsSketch.heapify(memory, sketchSerializer); - return new SketchDataStatistics(toStats); - } else { - throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); - } - } - - @Override - public DataStatistics copy(DataStatistics from, DataStatistics reuse) { - // not much benefit to reuse - return copy(from); - } - - @Override - public int getLength() { - return -1; - } - - @SuppressWarnings("unchecked") - @Override - public void serialize(DataStatistics obj, DataOutputView target) throws IOException { - StatisticsType statisticsType = obj.type(); - statisticsTypeSerializer.serialize(obj.type(), target); - if (statisticsType == StatisticsType.Map) { - Map mapStatistics = (Map) obj.result(); - mapSerializer.serialize(mapStatistics, target); - } else if (statisticsType == StatisticsType.Sketch) { - ReservoirItemsSketch sketch = (ReservoirItemsSketch) obj.result(); - byte[] sketchBytes = sketch.toByteArray(sketchSerializer); - target.writeInt(sketchBytes.length); - target.write(sketchBytes); - } else { - throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); - } - } - - @Override - public DataStatistics deserialize(DataInputView source) throws IOException { - StatisticsType statisticsType = statisticsTypeSerializer.deserialize(source); - if (statisticsType == StatisticsType.Map) { - Map mapStatistics = mapSerializer.deserialize(source); - return new MapDataStatistics(mapStatistics); - } else if (statisticsType == StatisticsType.Sketch) { - int numBytes = source.readInt(); - byte[] sketchBytes = new byte[numBytes]; - source.read(sketchBytes); - Memory sketchMemory = Memory.wrap(sketchBytes); - ReservoirItemsSketch sketch = - ReservoirItemsSketch.heapify(sketchMemory, sketchSerializer); - return new SketchDataStatistics(sketch); - } else { - throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); - } - } - - @Override - public DataStatistics deserialize(DataStatistics reuse, DataInputView source) throws IOException { - // not much benefit to reuse - return deserialize(source); - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws IOException { - serialize(deserialize(source), target); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof DataStatisticsSerializer)) { - return false; - } - - DataStatisticsSerializer other = (DataStatisticsSerializer) obj; - return Objects.equals(sortKeySerializer, other.sortKeySerializer); - } - - @Override - public int hashCode() { - return sortKeySerializer.hashCode(); - } - - @Override - public TypeSerializerSnapshot snapshotConfiguration() { - return new DataStatisticsSerializerSnapshot(this); - } - - public static class DataStatisticsSerializerSnapshot - extends CompositeTypeSerializerSnapshot { - private static final int CURRENT_VERSION = 1; - - /** Constructor for read instantiation. 
*/ - @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) - public DataStatisticsSerializerSnapshot() {} - - @SuppressWarnings("checkstyle:RedundantModifier") - public DataStatisticsSerializerSnapshot(DataStatisticsSerializer serializer) { - super(serializer); - } - - @Override - protected int getCurrentOuterSnapshotVersion() { - return CURRENT_VERSION; - } - - @Override - protected TypeSerializer[] getNestedSerializers(DataStatisticsSerializer outerSerializer) { - return new TypeSerializer[] {outerSerializer.sortKeySerializer}; - } - - @Override - protected DataStatisticsSerializer createOuterSerializerWithNestedSerializers( - TypeSerializer[] nestedSerializers) { - SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; - return new DataStatisticsSerializer(sortKeySerializer); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java deleted file mode 100644 index 50ec23e9f7a2..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Arrays; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * This is used by {@link RangePartitioner} for guiding range partitioning. This is what is sent to - * the operator subtasks. For sketch statistics, it only contains much smaller range bounds than the - * complete raw samples. 
- */ -class GlobalStatistics { - private final long checkpointId; - private final StatisticsType type; - private final MapAssignment mapAssignment; - private final SortKey[] rangeBounds; - - private transient Integer hashCode; - - GlobalStatistics( - long checkpointId, StatisticsType type, MapAssignment mapAssignment, SortKey[] rangeBounds) { - Preconditions.checkArgument( - (mapAssignment != null && rangeBounds == null) - || (mapAssignment == null && rangeBounds != null), - "Invalid key assignment or range bounds: both are non-null or null"); - this.checkpointId = checkpointId; - this.type = type; - this.mapAssignment = mapAssignment; - this.rangeBounds = rangeBounds; - } - - static GlobalStatistics fromMapAssignment(long checkpointId, MapAssignment mapAssignment) { - return new GlobalStatistics(checkpointId, StatisticsType.Map, mapAssignment, null); - } - - static GlobalStatistics fromRangeBounds(long checkpointId, SortKey[] rangeBounds) { - return new GlobalStatistics(checkpointId, StatisticsType.Sketch, null, rangeBounds); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("checkpointId", checkpointId) - .add("type", type) - .add("mapAssignment", mapAssignment) - .add("rangeBounds", rangeBounds) - .toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (!(o instanceof GlobalStatistics)) { - return false; - } - - GlobalStatistics other = (GlobalStatistics) o; - return Objects.equal(checkpointId, other.checkpointId) - && Objects.equal(type, other.type) - && Objects.equal(mapAssignment, other.mapAssignment()) - && Arrays.equals(rangeBounds, other.rangeBounds()); - } - - @Override - public int hashCode() { - // implemented caching because coordinator can call the hashCode many times. - // when subtasks request statistics refresh upon initialization for reconciliation purpose, - // hashCode is used to check if there is any difference btw coordinator and operator state. - if (hashCode == null) { - this.hashCode = Objects.hashCode(checkpointId, type, mapAssignment, rangeBounds); - } - - return hashCode; - } - - long checkpointId() { - return checkpointId; - } - - StatisticsType type() { - return type; - } - - MapAssignment mapAssignment() { - return mapAssignment; - } - - SortKey[] rangeBounds() { - return rangeBounds; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java deleted file mode 100644 index a7fe2b30b865..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.base.EnumSerializer; -import org.apache.flink.api.common.typeutils.base.IntSerializer; -import org.apache.flink.api.common.typeutils.base.ListSerializer; -import org.apache.flink.api.common.typeutils.base.LongSerializer; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -class GlobalStatisticsSerializer extends TypeSerializer { - private final TypeSerializer sortKeySerializer; - private final EnumSerializer statisticsTypeSerializer; - private final ListSerializer rangeBoundsSerializer; - private final ListSerializer intsSerializer; - private final ListSerializer longsSerializer; - - GlobalStatisticsSerializer(TypeSerializer sortKeySerializer) { - this.sortKeySerializer = sortKeySerializer; - this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); - this.rangeBoundsSerializer = new ListSerializer<>(sortKeySerializer); - this.intsSerializer = new ListSerializer<>(IntSerializer.INSTANCE); - this.longsSerializer = new ListSerializer<>(LongSerializer.INSTANCE); - } - - @Override - public boolean isImmutableType() { - return false; - } - - @Override - public TypeSerializer duplicate() { - return new GlobalStatisticsSerializer(sortKeySerializer); - } - - @Override - public GlobalStatistics createInstance() { - return GlobalStatistics.fromRangeBounds(0L, new SortKey[0]); - } - - @Override - public GlobalStatistics copy(GlobalStatistics from) { - return new GlobalStatistics( - from.checkpointId(), from.type(), from.mapAssignment(), from.rangeBounds()); - } - - @Override - public GlobalStatistics copy(GlobalStatistics from, GlobalStatistics reuse) { - // no benefit of reuse - return copy(from); - } - - @Override - public int getLength() { - return -1; - } - - @Override - public void serialize(GlobalStatistics record, DataOutputView target) throws IOException { - target.writeLong(record.checkpointId()); - statisticsTypeSerializer.serialize(record.type(), target); - if (record.type() == StatisticsType.Map) { - MapAssignment mapAssignment = record.mapAssignment(); - target.writeInt(mapAssignment.numPartitions()); - target.writeInt(mapAssignment.keyAssignments().size()); - for (Map.Entry entry : mapAssignment.keyAssignments().entrySet()) { - sortKeySerializer.serialize(entry.getKey(), target); - KeyAssignment keyAssignment = entry.getValue(); - intsSerializer.serialize(keyAssignment.assignedSubtasks(), target); - longsSerializer.serialize(keyAssignment.subtaskWeightsWithCloseFileCost(), target); - target.writeLong(keyAssignment.closeFileCostWeight()); - } - } else { - rangeBoundsSerializer.serialize(Arrays.asList(record.rangeBounds()), target); - } - } - - @Override - public GlobalStatistics deserialize(DataInputView source) throws IOException { - long checkpointId = source.readLong(); - StatisticsType type = statisticsTypeSerializer.deserialize(source); - if (type == StatisticsType.Map) 
{ - int numPartitions = source.readInt(); - int mapSize = source.readInt(); - Map keyAssignments = Maps.newHashMapWithExpectedSize(mapSize); - for (int i = 0; i < mapSize; ++i) { - SortKey sortKey = sortKeySerializer.deserialize(source); - List assignedSubtasks = intsSerializer.deserialize(source); - List subtaskWeightsWithCloseFileCost = longsSerializer.deserialize(source); - long closeFileCostWeight = source.readLong(); - keyAssignments.put( - sortKey, - new KeyAssignment( - assignedSubtasks, subtaskWeightsWithCloseFileCost, closeFileCostWeight)); - } - - return GlobalStatistics.fromMapAssignment( - checkpointId, new MapAssignment(numPartitions, keyAssignments)); - } else { - List sortKeys = rangeBoundsSerializer.deserialize(source); - SortKey[] rangeBounds = new SortKey[sortKeys.size()]; - return GlobalStatistics.fromRangeBounds(checkpointId, sortKeys.toArray(rangeBounds)); - } - } - - @Override - public GlobalStatistics deserialize(GlobalStatistics reuse, DataInputView source) - throws IOException { - // not much benefit to reuse - return deserialize(source); - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws IOException { - serialize(deserialize(source), target); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - - if (obj == null || getClass() != obj.getClass()) { - return false; - } - - GlobalStatisticsSerializer other = (GlobalStatisticsSerializer) obj; - return Objects.equals(sortKeySerializer, other.sortKeySerializer); - } - - @Override - public int hashCode() { - return sortKeySerializer.hashCode(); - } - - @Override - public TypeSerializerSnapshot snapshotConfiguration() { - return new GlobalStatisticsSerializerSnapshot(this); - } - - public static class GlobalStatisticsSerializerSnapshot - extends CompositeTypeSerializerSnapshot { - private static final int CURRENT_VERSION = 1; - - /** Constructor for read instantiation. */ - @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) - public GlobalStatisticsSerializerSnapshot() {} - - @SuppressWarnings("checkstyle:RedundantModifier") - public GlobalStatisticsSerializerSnapshot(GlobalStatisticsSerializer serializer) { - super(serializer); - } - - @Override - protected int getCurrentOuterSnapshotVersion() { - return CURRENT_VERSION; - } - - @Override - protected TypeSerializer[] getNestedSerializers(GlobalStatisticsSerializer outerSerializer) { - return new TypeSerializer[] {outerSerializer.sortKeySerializer}; - } - - @Override - protected GlobalStatisticsSerializer createOuterSerializerWithNestedSerializers( - TypeSerializer[] nestedSerializers) { - SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; - return new GlobalStatisticsSerializer(sortKeySerializer); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java deleted file mode 100644 index 781bcc646023..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Arrays; -import java.util.List; -import java.util.Objects; -import java.util.concurrent.ThreadLocalRandom; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** Subtask assignment for a key for Map statistics based */ -class KeyAssignment { - private final List assignedSubtasks; - private final List subtaskWeightsWithCloseFileCost; - private final long closeFileCostWeight; - private final long[] subtaskWeightsExcludingCloseCost; - private final long keyWeight; - private final long[] cumulativeWeights; - - /** - * @param assignedSubtasks assigned subtasks for this key. It could be a single subtask. It could - * also be multiple subtasks if the key has heavy weight that should be handled by multiple - * subtasks. - * @param subtaskWeightsWithCloseFileCost assigned weight for each subtask. E.g., if the keyWeight - * is 27 and the key is assigned to 3 subtasks, subtaskWeights could contain values as [10, - * 10, 7] for target weight of 10 per subtask. - */ - KeyAssignment( - List assignedSubtasks, - List subtaskWeightsWithCloseFileCost, - long closeFileCostWeight) { - Preconditions.checkArgument( - assignedSubtasks != null && !assignedSubtasks.isEmpty(), - "Invalid assigned subtasks: null or empty"); - Preconditions.checkArgument( - subtaskWeightsWithCloseFileCost != null && !subtaskWeightsWithCloseFileCost.isEmpty(), - "Invalid assigned subtasks weights: null or empty"); - Preconditions.checkArgument( - assignedSubtasks.size() == subtaskWeightsWithCloseFileCost.size(), - "Invalid assignment: size mismatch (tasks length = %s, weights length = %s)", - assignedSubtasks.size(), - subtaskWeightsWithCloseFileCost.size()); - subtaskWeightsWithCloseFileCost.forEach( - weight -> - Preconditions.checkArgument( - weight > closeFileCostWeight, - "Invalid weight: should be larger than close file cost: weight = %s, close file cost = %s", - weight, - closeFileCostWeight)); - - this.assignedSubtasks = assignedSubtasks; - this.subtaskWeightsWithCloseFileCost = subtaskWeightsWithCloseFileCost; - this.closeFileCostWeight = closeFileCostWeight; - // Exclude the close file cost for key routing - this.subtaskWeightsExcludingCloseCost = - subtaskWeightsWithCloseFileCost.stream() - .mapToLong(weightWithCloseFileCost -> weightWithCloseFileCost - closeFileCostWeight) - .toArray(); - this.keyWeight = Arrays.stream(subtaskWeightsExcludingCloseCost).sum(); - this.cumulativeWeights = new long[subtaskWeightsExcludingCloseCost.length]; - long cumulativeWeight = 0; - for (int i = 0; i < subtaskWeightsExcludingCloseCost.length; ++i) { - cumulativeWeight += subtaskWeightsExcludingCloseCost[i]; - cumulativeWeights[i] = cumulativeWeight; - } - } - - List assignedSubtasks() { - return assignedSubtasks; - } - - List subtaskWeightsWithCloseFileCost() { - return 
subtaskWeightsWithCloseFileCost; - } - - long closeFileCostWeight() { - return closeFileCostWeight; - } - - long[] subtaskWeightsExcludingCloseCost() { - return subtaskWeightsExcludingCloseCost; - } - - /** - * Select a subtask for the key. - * - * @return subtask id - */ - int select() { - if (assignedSubtasks.size() == 1) { - // only choice. no need to run random number generator. - return assignedSubtasks.get(0); - } else { - long randomNumber = ThreadLocalRandom.current().nextLong(keyWeight); - int index = Arrays.binarySearch(cumulativeWeights, randomNumber); - // choose the subtask where randomNumber < cumulativeWeights[pos]. - // this works regardless whether index is negative or not. - int position = Math.abs(index + 1); - Preconditions.checkState( - position < assignedSubtasks.size(), - "Invalid selected position: out of range. key weight = %s, random number = %s, cumulative weights array = %s", - keyWeight, - randomNumber, - cumulativeWeights); - return assignedSubtasks.get(position); - } - } - - @Override - public int hashCode() { - return Objects.hash(assignedSubtasks, subtaskWeightsWithCloseFileCost, closeFileCostWeight); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (o == null || getClass() != o.getClass()) { - return false; - } - - KeyAssignment that = (KeyAssignment) o; - return Objects.equals(assignedSubtasks, that.assignedSubtasks) - && Objects.equals(subtaskWeightsWithCloseFileCost, that.subtaskWeightsWithCloseFileCost) - && closeFileCostWeight == that.closeFileCostWeight; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("assignedSubtasks", assignedSubtasks) - .add("subtaskWeightsWithCloseFileCost", subtaskWeightsWithCloseFileCost) - .add("closeFileCostWeight", closeFileCostWeight) - .toString(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java deleted file mode 100644 index 9d8167460a1b..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
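[Context note, not part of the original patch] A self-contained sketch of the cumulative-weight selection idea behind KeyAssignment.select(): draw a random number below the total weight and binary-search the cumulative weights for the owning subtask. The class name WeightedSubtaskPick is illustrative only.

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;

public class WeightedSubtaskPick {
  // Returns the subtask whose cumulative-weight bucket contains the random draw.
  static int select(List<Integer> subtasks, long[] weights) {
    long[] cumulative = new long[weights.length];
    long total = 0;
    for (int i = 0; i < weights.length; i++) {
      total += weights[i];
      cumulative[i] = total;
    }

    long draw = ThreadLocalRandom.current().nextLong(total);
    int index = Arrays.binarySearch(cumulative, draw);
    // binarySearch returns the match position or (-(insertion point) - 1);
    // both cases resolve to the first bucket whose cumulative weight exceeds the draw.
    int position = index >= 0 ? index + 1 : -index - 1;
    return subtasks.get(position);
  }

  public static void main(String[] args) {
    // key weight 27 split across three subtasks as [10, 10, 7]
    System.out.println(select(List.of(3, 4, 5), new long[] {10, 10, 7}));
  }
}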
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.Pair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Key assignment to subtasks for Map statistics. */ -class MapAssignment { - private static final Logger LOG = LoggerFactory.getLogger(MapAssignment.class); - - private final int numPartitions; - private final Map keyAssignments; - - MapAssignment(int numPartitions, Map keyAssignments) { - Preconditions.checkArgument(keyAssignments != null, "Invalid key assignments: null"); - this.numPartitions = numPartitions; - this.keyAssignments = keyAssignments; - } - - static MapAssignment fromKeyFrequency( - int numPartitions, - Map mapStatistics, - double closeFileCostWeightPercentage, - Comparator comparator) { - return new MapAssignment( - numPartitions, - assignment(numPartitions, mapStatistics, closeFileCostWeightPercentage, comparator)); - } - - @Override - public int hashCode() { - return Objects.hashCode(numPartitions, keyAssignments); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (o == null || getClass() != o.getClass()) { - return false; - } - - MapAssignment that = (MapAssignment) o; - return numPartitions == that.numPartitions && keyAssignments.equals(that.keyAssignments); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("numPartitions", numPartitions) - .add("keyAssignments", keyAssignments) - .toString(); - } - - int numPartitions() { - return numPartitions; - } - - Map keyAssignments() { - return keyAssignments; - } - - /** - * Returns assignment summary for every subtask. - * - * @return assignment summary for every subtask. Key is subtaskId. 
Value pair is (weight assigned - * to the subtask, number of keys assigned to the subtask) - */ - Map> assignmentInfo() { - Map> assignmentInfo = Maps.newTreeMap(); - keyAssignments.forEach( - (key, keyAssignment) -> { - for (int i = 0; i < keyAssignment.assignedSubtasks().size(); ++i) { - int subtaskId = keyAssignment.assignedSubtasks().get(i); - long subtaskWeight = keyAssignment.subtaskWeightsExcludingCloseCost()[i]; - Pair oldValue = assignmentInfo.getOrDefault(subtaskId, Pair.of(0L, 0)); - assignmentInfo.put( - subtaskId, Pair.of(oldValue.first() + subtaskWeight, oldValue.second() + 1)); - } - }); - - return assignmentInfo; - } - - static Map assignment( - int numPartitions, - Map mapStatistics, - double closeFileCostWeightPercentage, - Comparator comparator) { - mapStatistics.forEach( - (key, value) -> - Preconditions.checkArgument( - value > 0, "Invalid statistics: weight is 0 for key %s", key)); - - long totalWeight = mapStatistics.values().stream().mapToLong(l -> l).sum(); - double targetWeightPerSubtask = ((double) totalWeight) / numPartitions; - long closeFileCostWeight = - (long) Math.ceil(targetWeightPerSubtask * closeFileCostWeightPercentage / 100); - - NavigableMap sortedStatsWithCloseFileCost = Maps.newTreeMap(comparator); - mapStatistics.forEach( - (k, v) -> { - int estimatedSplits = (int) Math.ceil(v / targetWeightPerSubtask); - long estimatedCloseFileCost = closeFileCostWeight * estimatedSplits; - sortedStatsWithCloseFileCost.put(k, v + estimatedCloseFileCost); - }); - - long totalWeightWithCloseFileCost = - sortedStatsWithCloseFileCost.values().stream().mapToLong(l -> l).sum(); - long targetWeightPerSubtaskWithCloseFileCost = - (long) Math.ceil(((double) totalWeightWithCloseFileCost) / numPartitions); - return buildAssignment( - numPartitions, - sortedStatsWithCloseFileCost, - targetWeightPerSubtaskWithCloseFileCost, - closeFileCostWeight); - } - - private static Map buildAssignment( - int numPartitions, - NavigableMap sortedStatistics, - long targetWeightPerSubtask, - long closeFileCostWeight) { - Map assignmentMap = - Maps.newHashMapWithExpectedSize(sortedStatistics.size()); - Iterator mapKeyIterator = sortedStatistics.keySet().iterator(); - int subtaskId = 0; - SortKey currentKey = null; - long keyRemainingWeight = 0L; - long subtaskRemainingWeight = targetWeightPerSubtask; - List assignedSubtasks = Lists.newArrayList(); - List subtaskWeights = Lists.newArrayList(); - while (mapKeyIterator.hasNext() || currentKey != null) { - // This should never happen because target weight is calculated using ceil function. - if (subtaskId >= numPartitions) { - LOG.error( - "Internal algorithm error: exhausted subtasks with unassigned keys left. 
number of partitions: {}, " - + "target weight per subtask: {}, close file cost in weight: {}, data statistics: {}", - numPartitions, - targetWeightPerSubtask, - closeFileCostWeight, - sortedStatistics); - throw new IllegalStateException( - "Internal algorithm error: exhausted subtasks with unassigned keys left"); - } - - if (currentKey == null) { - currentKey = mapKeyIterator.next(); - keyRemainingWeight = sortedStatistics.get(currentKey); - } - - assignedSubtasks.add(subtaskId); - if (keyRemainingWeight < subtaskRemainingWeight) { - // assign the remaining weight of the key to the current subtask - subtaskWeights.add(keyRemainingWeight); - subtaskRemainingWeight -= keyRemainingWeight; - keyRemainingWeight = 0L; - } else { - // filled up the current subtask - long assignedWeight = subtaskRemainingWeight; - keyRemainingWeight -= subtaskRemainingWeight; - - // If assigned weight is less than close file cost, pad it up with close file cost. - // This might cause the subtask assigned weight over the target weight. - // But it should be no more than one close file cost. Small skew is acceptable. - if (assignedWeight <= closeFileCostWeight) { - long paddingWeight = Math.min(keyRemainingWeight, closeFileCostWeight); - keyRemainingWeight -= paddingWeight; - assignedWeight += paddingWeight; - } - - subtaskWeights.add(assignedWeight); - // move on to the next subtask - subtaskId += 1; - subtaskRemainingWeight = targetWeightPerSubtask; - } - - Preconditions.checkState( - assignedSubtasks.size() == subtaskWeights.size(), - "List size mismatch: assigned subtasks = %s, subtask weights = %s", - assignedSubtasks, - subtaskWeights); - - // If the remaining key weight is smaller than the close file cost, simply skip the residual - // as it doesn't make sense to assign a weight smaller than close file cost to a new subtask. - // this might lead to some inaccuracy in weight calculation. E.g., assuming the key weight is - // 2 and close file cost is 2. key weight with close cost is 4. Let's assume the previous - // task has a weight of 3 available. So weight of 3 for this key is assigned to the task and - // the residual weight of 1 is dropped. Then the routing weight for this key is 1 (minus the - // close file cost), which is inaccurate as the true key weight should be 2. - // Again, this greedy algorithm is not intended to be perfect. Some small inaccuracy is - // expected and acceptable. Traffic distribution should still be balanced. - if (keyRemainingWeight > 0 && keyRemainingWeight <= closeFileCostWeight) { - keyRemainingWeight = 0; - } - - if (keyRemainingWeight == 0) { - // finishing up the assignment for the current key - KeyAssignment keyAssignment = - new KeyAssignment(assignedSubtasks, subtaskWeights, closeFileCostWeight); - assignmentMap.put(currentKey, keyAssignment); - assignedSubtasks = Lists.newArrayList(); - subtaskWeights = Lists.newArrayList(); - currentKey = null; - } - } - - return assignmentMap; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java deleted file mode 100644 index 05b943f6046f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
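[Context note, not part of the original patch] A simplified, self-contained sketch of the greedy packing loop in buildAssignment(), ignoring the close-file-cost padding and the comparator-ordered map: keys are poured into subtasks of a fixed target weight, and a key is split across subtasks when it overflows the current one. The class name GreedyKeyPacking is illustrative only.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class GreedyKeyPacking {
  // key -> human-readable slices of "subtask -> assigned weight"
  static Map<String, List<String>> pack(Map<String, Long> weights, int numPartitions) {
    long total = weights.values().stream().mapToLong(Long::longValue).sum();
    long target = (long) Math.ceil((double) total / numPartitions);

    Map<String, List<String>> assignment = new LinkedHashMap<>();
    int subtask = 0;
    long subtaskRemaining = target;
    for (Map.Entry<String, Long> entry : weights.entrySet()) {
      long keyRemaining = entry.getValue();
      List<String> slices = new ArrayList<>();
      while (keyRemaining > 0) {
        long assigned = Math.min(keyRemaining, subtaskRemaining);
        slices.add("subtask " + subtask + " -> " + assigned);
        keyRemaining -= assigned;
        subtaskRemaining -= assigned;
        if (subtaskRemaining == 0) { // current subtask is full, move on
          subtask++;
          subtaskRemaining = target;
        }
      }
      assignment.put(entry.getKey(), slices);
    }

    return assignment;
  }

  public static void main(String[] args) {
    Map<String, Long> stats = new LinkedHashMap<>();
    stats.put("a", 7L);
    stats.put("b", 3L);
    stats.put("c", 10L);
    // target weight is ceil(20 / 2) = 10 per subtask
    System.out.println(pack(stats, 2));
  }
}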
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Map; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -/** MapDataStatistics uses map to count key frequency */ -class MapDataStatistics implements DataStatistics { - private final Map keyFrequency; - - MapDataStatistics() { - this.keyFrequency = Maps.newHashMap(); - } - - MapDataStatistics(Map keyFrequency) { - this.keyFrequency = keyFrequency; - } - - @Override - public StatisticsType type() { - return StatisticsType.Map; - } - - @Override - public boolean isEmpty() { - return keyFrequency.isEmpty(); - } - - @Override - public void add(SortKey sortKey) { - if (keyFrequency.containsKey(sortKey)) { - keyFrequency.merge(sortKey, 1L, Long::sum); - } else { - // clone the sort key before adding to map because input sortKey object can be reused - SortKey copiedKey = sortKey.copy(); - keyFrequency.put(copiedKey, 1L); - } - } - - @Override - public Object result() { - return keyFrequency; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("map", keyFrequency).toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (!(o instanceof MapDataStatistics)) { - return false; - } - - MapDataStatistics other = (MapDataStatistics) o; - return Objects.equal(keyFrequency, other.keyFrequency); - } - - @Override - public int hashCode() { - return Objects.hashCode(keyFrequency); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java deleted file mode 100644 index f36a078c94e0..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
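[Context note, not part of the original patch] A tiny illustration of why MapDataStatistics.add() copies the key before inserting: the caller reuses one mutable key object per record, and a map key must never change after insertion. The class names ReusedKeyCopyExample and Key are made up stand-ins for the reused SortKey wrapper.

import java.util.HashMap;
import java.util.Map;

public class ReusedKeyCopyExample {
  // A mutable key holder, standing in for the reused SortKey row wrapper.
  static final class Key {
    String value;
    Key(String value) { this.value = value; }
    Key copy() { return new Key(value); }
    @Override public boolean equals(Object o) { return o instanceof Key && ((Key) o).value.equals(value); }
    @Override public int hashCode() { return value.hashCode(); }
    @Override public String toString() { return value; }
  }

  public static void main(String[] args) {
    Map<Key, Long> frequency = new HashMap<>();
    Key reused = new Key("a");

    // Copy before inserting: the caller mutates `reused` for the next record.
    frequency.merge(reused.copy(), 1L, Long::sum);
    reused.value = "b";
    frequency.merge(reused.copy(), 1L, Long::sum);
    reused.value = "a";
    frequency.merge(reused.copy(), 1L, Long::sum);

    System.out.println(frequency); // a=2, b=1 (order may vary)
  }
}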
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.concurrent.TimeUnit; -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Internal partitioner implementation that supports MapDataStatistics, which is typically used for - * low-cardinality use cases. While MapDataStatistics can keep accurate counters, it can't be used - * for high-cardinality use cases. Otherwise, the memory footprint is too high. - * - *

    It is a greedy algorithm for bin packing. With close file cost, the calculation isn't always - * precise when calculating close cost for every file, target weight per subtask, padding residual - * weight, assigned weight without close cost. - * - *

    All actions should be executed in a single Flink mailbox thread. So there is no need to make - * it thread safe. - */ -class MapRangePartitioner implements Partitioner { - private static final Logger LOG = LoggerFactory.getLogger(MapRangePartitioner.class); - - private final RowDataWrapper rowDataWrapper; - private final SortKey sortKey; - private final MapAssignment mapAssignment; - - // Counter that tracks how many times a new key encountered - // where there is no traffic statistics learned about it. - private long newSortKeyCounter; - private long lastNewSortKeyLogTimeMilli; - - MapRangePartitioner(Schema schema, SortOrder sortOrder, MapAssignment mapAssignment) { - this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - this.sortKey = new SortKey(schema, sortOrder); - this.mapAssignment = mapAssignment; - this.newSortKeyCounter = 0; - this.lastNewSortKeyLogTimeMilli = System.currentTimeMillis(); - } - - @Override - public int partition(RowData row, int numPartitions) { - // reuse the sortKey and rowDataWrapper - sortKey.wrap(rowDataWrapper.wrap(row)); - KeyAssignment keyAssignment = mapAssignment.keyAssignments().get(sortKey); - - int partition; - if (keyAssignment == null) { - LOG.trace( - "Encountered new sort key: {}. Fall back to round robin as statistics not learned yet.", - sortKey); - // Ideally unknownKeyCounter should be published as a counter metric. - // It seems difficult to pass in MetricGroup into the partitioner. - // Just log an INFO message every minute. - newSortKeyCounter += 1; - long now = System.currentTimeMillis(); - if (now - lastNewSortKeyLogTimeMilli > TimeUnit.MINUTES.toMillis(1)) { - LOG.info( - "Encounter new sort keys {} times. Fall back to round robin as statistics not learned yet", - newSortKeyCounter); - lastNewSortKeyLogTimeMilli = now; - newSortKeyCounter = 0; - } - partition = (int) (newSortKeyCounter % numPartitions); - } else { - partition = keyAssignment.select(); - } - - return RangePartitioner.adjustPartitionWithRescale( - partition, mapAssignment.numPartitions(), numPartitions); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java deleted file mode 100644 index 6608b938f5a8..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Random; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** This custom partitioner implements the {@link DistributionMode#RANGE} for Flink sink. */ -@Internal -public class RangePartitioner implements Partitioner { - private static final Logger LOG = LoggerFactory.getLogger(RangePartitioner.class); - - private final Schema schema; - private final SortOrder sortOrder; - - private transient AtomicLong roundRobinCounter; - private transient Partitioner delegatePartitioner; - - public RangePartitioner(Schema schema, SortOrder sortOrder) { - this.schema = schema; - this.sortOrder = sortOrder; - } - - @Override - public int partition(StatisticsOrRecord wrapper, int numPartitions) { - if (wrapper.hasStatistics()) { - this.delegatePartitioner = delegatePartitioner(wrapper.statistics()); - return (int) (roundRobinCounter(numPartitions).getAndIncrement() % numPartitions); - } else { - if (delegatePartitioner != null) { - return delegatePartitioner.partition(wrapper.record(), numPartitions); - } else { - int partition = (int) (roundRobinCounter(numPartitions).getAndIncrement() % numPartitions); - LOG.trace("Statistics not available. Round robin to partition {}", partition); - return partition; - } - } - } - - private AtomicLong roundRobinCounter(int numPartitions) { - if (roundRobinCounter == null) { - // randomize the starting point to avoid synchronization across subtasks - this.roundRobinCounter = new AtomicLong(new Random().nextInt(numPartitions)); - } - - return roundRobinCounter; - } - - private Partitioner delegatePartitioner(GlobalStatistics statistics) { - if (statistics.type() == StatisticsType.Map) { - return new MapRangePartitioner(schema, sortOrder, statistics.mapAssignment()); - } else if (statistics.type() == StatisticsType.Sketch) { - return new SketchRangePartitioner(schema, sortOrder, statistics.rangeBounds()); - } else { - throw new IllegalArgumentException( - String.format("Invalid statistics type: %s. Should be Map or Sketch", statistics.type())); - } - } - - /** - * Util method that handles rescale (write parallelism / numPartitions change). - * - * @param partition partition caculated based on the existing statistics - * @param numPartitionsStatsCalculation number of partitions when the assignment was calculated - * based on - * @param numPartitions current number of partitions - * @return adjusted partition if necessary. - */ - static int adjustPartitionWithRescale( - int partition, int numPartitionsStatsCalculation, int numPartitions) { - if (numPartitionsStatsCalculation <= numPartitions) { - // no rescale or scale-up case. - // new subtasks are ignored and not assigned any keys, which is sub-optimal and only - // transient. when rescale is detected, operator requests new statistics from - // coordinator upon initialization. - return partition; - } else { - // scale-down case. - // Use mod % operation to distribution the over-range partitions. - // It can cause skew among subtasks. but the behavior is still better than - // discarding the statistics and falling back to round-robin (no clustering). 
- // Again, this is transient and stats refresh is requested when rescale is detected. - return partition % numPartitions; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java deleted file mode 100644 index ce94bec14860..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.runtime.operators.coordination.OperatorEvent; - -class RequestGlobalStatisticsEvent implements OperatorEvent { - private final Integer signature; - - RequestGlobalStatisticsEvent() { - this.signature = null; - } - - /** - * @param signature hashCode of the subtask's existing global statistics - */ - RequestGlobalStatisticsEvent(int signature) { - this.signature = signature; - } - - Integer signature() { - return signature; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java deleted file mode 100644 index efd87a883d78..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
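[Context note, not part of the original patch] A minimal standalone illustration of the rescale adjustment in adjustPartitionWithRescale(): a partition computed against an old parallelism is kept on scale-up and folded with modulo on scale-down. The class name RescaleAdjustExample is illustrative only.

public class RescaleAdjustExample {
  // Keep the partition when parallelism grew (or stayed the same);
  // fold it with modulo when parallelism shrank.
  static int adjust(int partition, int partitionsWhenComputed, int currentPartitions) {
    return partitionsWhenComputed <= currentPartitions ? partition : partition % currentPartitions;
  }

  public static void main(String[] args) {
    System.out.println(adjust(7, 8, 16)); // scale-up: 7 (new subtasks stay idle until stats refresh)
    System.out.println(adjust(7, 8, 4));  // scale-down: 7 % 4 = 3
  }
}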
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Arrays; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; - -/** SketchDataStatistics uses reservoir sampling algorithm to count key frequency */ -class SketchDataStatistics implements DataStatistics { - - private final ReservoirItemsSketch sketch; - - SketchDataStatistics(int reservoirSize) { - this.sketch = ReservoirItemsSketch.newInstance(reservoirSize); - } - - SketchDataStatistics(ReservoirItemsSketch sketchStats) { - this.sketch = sketchStats; - } - - @Override - public StatisticsType type() { - return StatisticsType.Sketch; - } - - @Override - public boolean isEmpty() { - return sketch.getNumSamples() == 0; - } - - @Override - public void add(SortKey sortKey) { - // clone the sort key first because input sortKey object can be reused - SortKey copiedKey = sortKey.copy(); - sketch.update(copiedKey); - } - - @Override - public Object result() { - return sketch; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("sketch", sketch).toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (!(o instanceof SketchDataStatistics)) { - return false; - } - - ReservoirItemsSketch otherSketch = ((SketchDataStatistics) o).sketch; - return Objects.equal(sketch.getK(), otherSketch.getK()) - && Objects.equal(sketch.getN(), otherSketch.getN()) - && Arrays.deepEquals(sketch.getSamples(), otherSketch.getSamples()); - } - - @Override - public int hashCode() { - return Objects.hashCode(sketch.getK(), sketch.getN(), sketch.getSamples()); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java deleted file mode 100644 index dddb0d8722c0..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Comparator; -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.types.Comparators; - -class SketchRangePartitioner implements Partitioner { - private final SortKey sortKey; - private final Comparator comparator; - private final SortKey[] rangeBounds; - private final RowDataWrapper rowDataWrapper; - - SketchRangePartitioner(Schema schema, SortOrder sortOrder, SortKey[] rangeBounds) { - this.sortKey = new SortKey(schema, sortOrder); - this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); - this.rangeBounds = rangeBounds; - this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - } - - @Override - public int partition(RowData row, int numPartitions) { - // reuse the sortKey and rowDataWrapper - sortKey.wrap(rowDataWrapper.wrap(row)); - return SketchUtil.partition(sortKey, numPartitions, rangeBounds, comparator); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java deleted file mode 100644 index 3d572b98d53f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.function.Consumer; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -class SketchUtil { - static final int COORDINATOR_MIN_RESERVOIR_SIZE = 10_000; - static final int COORDINATOR_MAX_RESERVOIR_SIZE = 1_000_000; - static final int COORDINATOR_TARGET_PARTITIONS_MULTIPLIER = 100; - static final int OPERATOR_OVER_SAMPLE_RATIO = 10; - - // switch the statistics tracking from map to sketch if the cardinality of the sort key is over - // this threshold. It is hardcoded for now, we can revisit in the future if config is needed. 
- static final int OPERATOR_SKETCH_SWITCH_THRESHOLD = 10_000; - static final int COORDINATOR_SKETCH_SWITCH_THRESHOLD = 100_000; - - private SketchUtil() {} - - /** - * The larger the reservoir size, the more accurate for range bounds calculation and the more - * balanced range distribution. - * - *

    Here are the heuristic rules - *

  • Target size: numPartitions x 100 to achieve good accuracy and make it easier to calculate the - * range bounds - *
  • Min is 10K to achieve good accuracy while memory footprint is still relatively small - *
  • Max is 1M to cap the memory footprint on coordinator - * - * @param numPartitions number of range partitions which equals to downstream operator parallelism - * @return reservoir size - */ - static int determineCoordinatorReservoirSize(int numPartitions) { - int reservoirSize = numPartitions * COORDINATOR_TARGET_PARTITIONS_MULTIPLIER; - - if (reservoirSize < COORDINATOR_MIN_RESERVOIR_SIZE) { - // adjust it up and still make reservoirSize divisible by numPartitions - int remainder = COORDINATOR_MIN_RESERVOIR_SIZE % numPartitions; - reservoirSize = COORDINATOR_MIN_RESERVOIR_SIZE + (numPartitions - remainder); - } else if (reservoirSize > COORDINATOR_MAX_RESERVOIR_SIZE) { - // adjust it down and still make reservoirSize divisible by numPartitions - int remainder = COORDINATOR_MAX_RESERVOIR_SIZE % numPartitions; - reservoirSize = COORDINATOR_MAX_RESERVOIR_SIZE - remainder; - } - - return reservoirSize; - } - - /** - * Determine the sampling reservoir size where operator subtasks collect data statistics. - * - *

    Here are the heuristic rules - *

  • Target size is "coordinator reservoir size * over sampling ratio (10) / operator - * parallelism" - *
  • Min is 1K to achieve good accuracy while memory footprint is still relatively small - *
  • Max is 100K to cap the memory footprint on coordinator - * - * @param numPartitions number of range partitions which equals to downstream operator parallelism - * @param operatorParallelism data statistics operator parallelism - * @return reservoir size - */ - static int determineOperatorReservoirSize(int operatorParallelism, int numPartitions) { - int coordinatorReservoirSize = determineCoordinatorReservoirSize(numPartitions); - int totalOperatorSamples = coordinatorReservoirSize * OPERATOR_OVER_SAMPLE_RATIO; - return (int) Math.ceil((double) totalOperatorSamples / operatorParallelism); - } - - /** - * To understand how range bounds are used in range partitioning, here is an example for human - * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be - * - *
      - *
    • age <= 15 - *
    • age > 15 && age <= 32 - *
    • age > 32 && age <= 60 - *
    • age > 60 - *
    - * - *

    Assumption is that a single key is not dominant enough to span multiple subtasks. - * - * @param numPartitions number of partitions which maps to downstream operator parallelism - * @param samples sampled keys - * @return array of range partition bounds. It should be a sorted list (ascending). Number of - * items should be {@code numPartitions - 1}. if numPartitions is 1, return an empty list - */ - static SortKey[] rangeBounds( - int numPartitions, Comparator comparator, SortKey[] samples) { - // sort the keys first - Arrays.sort(samples, comparator); - int numCandidates = numPartitions - 1; - List candidatesList = Lists.newLinkedList(); - int step = (int) Math.ceil((double) samples.length / numPartitions); - int position = step - 1; - int numChosen = 0; - while (position < samples.length && numChosen < numCandidates) { - SortKey candidate = samples[position]; - // skip duplicate values - if (numChosen > 0 && candidate.equals(candidatesList.get(candidatesList.size() - 1))) { - // linear probe for the next distinct value - position += 1; - } else { - candidatesList.add(candidate); - position += step; - numChosen += 1; - } - } - SortKey[] candidates = candidatesList.toArray(new SortKey[0]); - return candidates; - } - - /** This can be a bit expensive since it is quadratic. */ - static void convertMapToSketch( - Map taskMapStats, Consumer sketchConsumer) { - taskMapStats.forEach( - (sortKey, count) -> { - for (int i = 0; i < count; ++i) { - sketchConsumer.accept(sortKey); - } - }); - } - - static int partition( - SortKey key, int numPartitions, SortKey[] rangeBounds, Comparator comparator) { - int partition = Arrays.binarySearch(rangeBounds, key, comparator); - - // binarySearch either returns the match location or -[insertion point]-1 - if (partition < 0) { - partition = -partition - 1; - } - - if (partition > rangeBounds.length) { - partition = rangeBounds.length; - } - - return RangePartitioner.adjustPartitionWithRescale( - partition, rangeBounds.length + 1, numPartitions); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java deleted file mode 100644 index acd078a61cd3..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java +++ /dev/null @@ -1,411 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
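[Context note, not part of the original patch] A small usage-style sketch of the binary-search lookup in SketchUtil.partition(), using plain integers in place of SortKey to show how the [15, 32, 60] age bounds above map keys to 4 partitions. The class name RangeBoundsLookupExample is illustrative only.

import java.util.Arrays;

public class RangeBoundsLookupExample {
  // Same conversion as above: binarySearch returns the match index or
  // (-(insertion point) - 1); both resolve to the first bound >= key.
  static int partition(int key, int[] rangeBounds) {
    int pos = Arrays.binarySearch(rangeBounds, key);
    if (pos < 0) {
      pos = -pos - 1;
    }
    return Math.min(pos, rangeBounds.length); // clamp, as in the code above
  }

  public static void main(String[] args) {
    int[] bounds = {15, 32, 60}; // 4 partitions
    System.out.println(partition(10, bounds)); // 0: age <= 15
    System.out.println(partition(15, bounds)); // 0: boundary values stay in the lower range
    System.out.println(partition(40, bounds)); // 2: 32 < age <= 60
    System.out.println(partition(90, bounds)); // 3: age > 60
  }
}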
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.util.List; -import java.util.Locale; -import java.util.Objects; -import java.util.Set; -import java.util.UUID; -import java.util.stream.Collectors; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.flink.util.StringUtils; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.SortField; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderParser; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.types.CheckCompatibility; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -class SortKeySerializer extends TypeSerializer { - private final Schema schema; - private final SortOrder sortOrder; - private final int size; - private final Types.NestedField[] transformedFields; - - private int version; - - private transient SortKey sortKey; - - SortKeySerializer(Schema schema, SortOrder sortOrder, int version) { - this.version = version; - this.schema = schema; - this.sortOrder = sortOrder; - this.size = sortOrder.fields().size(); - - this.transformedFields = new Types.NestedField[size]; - for (int i = 0; i < size; ++i) { - SortField sortField = sortOrder.fields().get(i); - Types.NestedField sourceField = schema.findField(sortField.sourceId()); - Type resultType = sortField.transform().getResultType(sourceField.type()); - Types.NestedField transformedField = - Types.NestedField.of( - sourceField.fieldId(), - sourceField.isOptional(), - sourceField.name(), - resultType, - sourceField.doc()); - transformedFields[i] = transformedField; - } - } - - SortKeySerializer(Schema schema, SortOrder sortOrder) { - this(schema, sortOrder, SortKeySerializerSnapshot.CURRENT_VERSION); - } - - private SortKey lazySortKey() { - if (sortKey == null) { - this.sortKey = new SortKey(schema, sortOrder); - } - - return sortKey; - } - - public int getLatestVersion() { - return snapshotConfiguration().getCurrentVersion(); - } - - public void restoreToLatestVersion() { - this.version = snapshotConfiguration().getCurrentVersion(); - } - - public void setVersion(int version) { - this.version = version; - } - - @Override - public boolean isImmutableType() { - return false; - } - - @Override - public TypeSerializer duplicate() { - return new SortKeySerializer(schema, sortOrder); - } - - @Override - public SortKey createInstance() { - return new SortKey(schema, sortOrder); - } - - @Override - public SortKey copy(SortKey from) { - return from.copy(); - } - - @Override - public SortKey copy(SortKey from, SortKey reuse) { - // no benefit of reuse - return copy(from); - } - - @Override - public int getLength() { - return -1; - } - - @Override - public void serialize(SortKey record, DataOutputView target) throws IOException { - Preconditions.checkArgument( - record.size() == size, - "Invalid size of the sort key object: %s. 
Expected %s", - record.size(), - size); - for (int i = 0; i < size; ++i) { - int fieldId = transformedFields[i].fieldId(); - Type.TypeID typeId = transformedFields[i].type().typeId(); - if (version > 1) { - Object value = record.get(i, Object.class); - if (value == null) { - target.writeBoolean(true); - continue; - } else { - target.writeBoolean(false); - } - } - - switch (typeId) { - case BOOLEAN: - target.writeBoolean(record.get(i, Boolean.class)); - break; - case INTEGER: - case DATE: - target.writeInt(record.get(i, Integer.class)); - break; - case LONG: - case TIME: - case TIMESTAMP: - target.writeLong(record.get(i, Long.class)); - break; - case FLOAT: - target.writeFloat(record.get(i, Float.class)); - break; - case DOUBLE: - target.writeDouble(record.get(i, Double.class)); - break; - case STRING: - target.writeUTF(record.get(i, CharSequence.class).toString()); - break; - case UUID: - UUID uuid = record.get(i, UUID.class); - target.writeLong(uuid.getMostSignificantBits()); - target.writeLong(uuid.getLeastSignificantBits()); - break; - case FIXED: - case BINARY: - byte[] bytes = record.get(i, ByteBuffer.class).array(); - target.writeInt(bytes.length); - target.write(bytes); - break; - case DECIMAL: - BigDecimal decimal = record.get(i, BigDecimal.class); - byte[] decimalBytes = decimal.unscaledValue().toByteArray(); - target.writeInt(decimalBytes.length); - target.write(decimalBytes); - target.writeInt(decimal.scale()); - break; - case STRUCT: - case MAP: - case LIST: - default: - // SortKey transformation is a flattened struct without list and map - throw new UnsupportedOperationException( - String.format( - Locale.ROOT, "Field %d has unsupported field type: %s", fieldId, typeId)); - } - } - } - - @Override - public SortKey deserialize(DataInputView source) throws IOException { - // copying is a little faster than constructing a new SortKey object - SortKey deserialized = lazySortKey().copy(); - deserialize(deserialized, source); - return deserialized; - } - - @Override - public SortKey deserialize(SortKey reuse, DataInputView source) throws IOException { - Preconditions.checkArgument( - reuse.size() == size, - "Invalid size of the sort key object: %s. 
Expected %s", - reuse.size(), - size); - for (int i = 0; i < size; ++i) { - if (version > 1) { - boolean isNull = source.readBoolean(); - if (isNull) { - reuse.set(i, null); - continue; - } - } - - int fieldId = transformedFields[i].fieldId(); - Type.TypeID typeId = transformedFields[i].type().typeId(); - switch (typeId) { - case BOOLEAN: - reuse.set(i, source.readBoolean()); - break; - case INTEGER: - case DATE: - reuse.set(i, source.readInt()); - break; - case LONG: - case TIME: - case TIMESTAMP: - reuse.set(i, source.readLong()); - break; - case FLOAT: - reuse.set(i, source.readFloat()); - break; - case DOUBLE: - reuse.set(i, source.readDouble()); - break; - case STRING: - reuse.set(i, source.readUTF()); - break; - case UUID: - long mostSignificantBits = source.readLong(); - long leastSignificantBits = source.readLong(); - reuse.set(i, new UUID(mostSignificantBits, leastSignificantBits)); - break; - case FIXED: - case BINARY: - byte[] bytes = new byte[source.readInt()]; - source.read(bytes); - reuse.set(i, ByteBuffer.wrap(bytes)); - break; - case DECIMAL: - byte[] unscaledBytes = new byte[source.readInt()]; - source.read(unscaledBytes); - int scale = source.readInt(); - BigDecimal decimal = new BigDecimal(new BigInteger(unscaledBytes), scale); - reuse.set(i, decimal); - break; - case STRUCT: - case MAP: - case LIST: - default: - // SortKey transformation is a flattened struct without list and map - throw new UnsupportedOperationException( - String.format( - Locale.ROOT, "Field %d has unsupported field type: %s", fieldId, typeId)); - } - } - - return reuse; - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws IOException { - // no optimization here - serialize(deserialize(source), target); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof SortKeySerializer)) { - return false; - } - - SortKeySerializer other = (SortKeySerializer) obj; - return Objects.equals(schema.asStruct(), other.schema.asStruct()) - && Objects.equals(sortOrder, other.sortOrder); - } - - @Override - public int hashCode() { - return schema.asStruct().hashCode() * 31 + sortOrder.hashCode(); - } - - @Override - public TypeSerializerSnapshot snapshotConfiguration() { - return new SortKeySerializerSnapshot(schema, sortOrder); - } - - public static class SortKeySerializerSnapshot implements TypeSerializerSnapshot { - private static final int CURRENT_VERSION = 2; - - private Schema schema; - private SortOrder sortOrder; - - private int version = CURRENT_VERSION; - - /** Constructor for read instantiation. */ - @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) - public SortKeySerializerSnapshot() { - // this constructor is used when restoring from a checkpoint. 
- } - - @SuppressWarnings("checkstyle:RedundantModifier") - public SortKeySerializerSnapshot(Schema schema, SortOrder sortOrder) { - this.schema = schema; - this.sortOrder = sortOrder; - } - - @Override - public int getCurrentVersion() { - return CURRENT_VERSION; - } - - @Override - public void writeSnapshot(DataOutputView out) throws IOException { - Preconditions.checkState(schema != null, "Invalid schema: null"); - Preconditions.checkState(sortOrder != null, "Invalid sort order: null"); - - StringUtils.writeString(SchemaParser.toJson(schema), out); - StringUtils.writeString(SortOrderParser.toJson(sortOrder), out); - } - - @Override - public void readSnapshot(int readVersion, DataInputView in, ClassLoader userCodeClassLoader) - throws IOException { - switch (readVersion) { - case 1: - read(in); - this.version = 1; - break; - case 2: - read(in); - break; - default: - throw new IllegalArgumentException("Unknown read version: " + readVersion); - } - } - - @Override - public TypeSerializerSchemaCompatibility resolveSchemaCompatibility( - TypeSerializerSnapshot oldSerializerSnapshot) { - if (!(oldSerializerSnapshot instanceof SortKeySerializerSnapshot)) { - return TypeSerializerSchemaCompatibility.incompatible(); - } - - if (oldSerializerSnapshot.getCurrentVersion() == 1 && this.getCurrentVersion() == 2) { - return TypeSerializerSchemaCompatibility.compatibleAfterMigration(); - } - - // Sort order should be identical - SortKeySerializerSnapshot oldSnapshot = (SortKeySerializerSnapshot) oldSerializerSnapshot; - if (!sortOrder.sameOrder(oldSnapshot.sortOrder)) { - return TypeSerializerSchemaCompatibility.incompatible(); - } - - Set sortFieldIds = - sortOrder.fields().stream().map(SortField::sourceId).collect(Collectors.toSet()); - // only care about the schema related to sort fields - Schema sortSchema = TypeUtil.project(schema, sortFieldIds); - Schema oldSortSchema = TypeUtil.project(oldSnapshot.schema, sortFieldIds); - - List compatibilityErrors = - CheckCompatibility.writeCompatibilityErrors(sortSchema, oldSortSchema); - if (compatibilityErrors.isEmpty()) { - return TypeSerializerSchemaCompatibility.compatibleAsIs(); - } - - return TypeSerializerSchemaCompatibility.incompatible(); - } - - @Override - public TypeSerializer restoreSerializer() { - Preconditions.checkState(schema != null, "Invalid schema: null"); - Preconditions.checkState(sortOrder != null, "Invalid sort order: null"); - return new SortKeySerializer(schema, sortOrder, version); - } - - private void read(DataInputView in) throws IOException { - String schemaJson = StringUtils.readString(in); - String sortOrderJson = StringUtils.readString(in); - this.schema = SchemaParser.fromJson(schemaJson); - this.sortOrder = SortOrderParser.fromJson(sortOrderJson).bind(schema); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java deleted file mode 100644 index d6c23f035015..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
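The serializer deleted above gates a per-field null marker on the serializer version: from version 2 on, every field value is preceded by a boolean flag, while version 1 writes the value directly and cannot represent null. A standalone sketch of that wire layout for a single nullable long, using Flink's DataOutputSerializer and DataInputDeserializer (hypothetical class name, not the patch's code):

import java.io.IOException;
import org.apache.flink.core.memory.DataInputDeserializer;
import org.apache.flink.core.memory.DataOutputSerializer;

public class NullMarkerLayoutSketch {
  // V2 layout: a boolean null marker precedes the value; V1 writes the value directly.
  static byte[] write(Long value, int version) throws IOException {
    DataOutputSerializer out = new DataOutputSerializer(16);
    if (version > 1) {
      out.writeBoolean(value == null);
      if (value == null) {
        return out.getCopyOfBuffer();
      }
    }
    out.writeLong(value); // version 1 assumes a non-null value here
    return out.getCopyOfBuffer();
  }

  static Long read(byte[] bytes, int version) throws IOException {
    DataInputDeserializer in = new DataInputDeserializer(bytes);
    if (version > 1 && in.readBoolean()) {
      return null;
    }
    return in.readLong();
  }

  public static void main(String[] args) throws IOException {
    System.out.println(read(write(42L, 2), 2));  // 42
    System.out.println(read(write(null, 2), 2)); // null
  }
}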
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.io.Serializable; -import java.io.UncheckedIOException; -import java.util.Arrays; -import java.util.List; -import org.apache.datasketches.common.ArrayOfItemsSerDe; -import org.apache.datasketches.common.ArrayOfStringsSerDe; -import org.apache.datasketches.common.ByteArrayUtil; -import org.apache.datasketches.common.Util; -import org.apache.datasketches.memory.Memory; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.base.ListSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * Only way to implement {@link ReservoirItemsSketch} serializer is to extend from {@link - * ArrayOfItemsSerDe}, as deserialization uses a private constructor from ReservoirItemsSketch. The - * implementation is modeled after {@link ArrayOfStringsSerDe} - */ -class SortKeySketchSerializer extends ArrayOfItemsSerDe implements Serializable { - private static final int DEFAULT_SORT_KEY_SIZE = 128; - - private final TypeSerializer itemSerializer; - private final ListSerializer listSerializer; - private final DataInputDeserializer input; - - SortKeySketchSerializer(TypeSerializer itemSerializer) { - this.itemSerializer = itemSerializer; - this.listSerializer = new ListSerializer<>(itemSerializer); - this.input = new DataInputDeserializer(); - } - - @Override - public byte[] serializeToByteArray(SortKey item) { - try { - DataOutputSerializer output = new DataOutputSerializer(DEFAULT_SORT_KEY_SIZE); - itemSerializer.serialize(item, output); - byte[] itemBytes = output.getSharedBuffer(); - int numBytes = output.length(); - byte[] out = new byte[numBytes + Integer.BYTES]; - ByteArrayUtil.copyBytes(itemBytes, 0, out, 4, numBytes); - ByteArrayUtil.putIntLE(out, 0, numBytes); - return out; - } catch (IOException e) { - throw new UncheckedIOException("Failed to serialize sort key", e); - } - } - - @Override - public byte[] serializeToByteArray(SortKey[] items) { - try { - DataOutputSerializer output = new DataOutputSerializer(DEFAULT_SORT_KEY_SIZE * items.length); - listSerializer.serialize(Arrays.asList(items), output); - byte[] itemsBytes = output.getSharedBuffer(); - int numBytes = output.length(); - byte[] out = new byte[Integer.BYTES + numBytes]; - ByteArrayUtil.putIntLE(out, 0, numBytes); - System.arraycopy(itemsBytes, 0, out, Integer.BYTES, numBytes); - return out; - } catch (IOException e) { - throw new UncheckedIOException("Failed to serialize sort key", e); - } - } - - @Override - public SortKey[] deserializeFromMemory(Memory mem, long startingOffset, int numItems) { - Preconditions.checkArgument(mem != null, "Invalid input memory: 
null"); - if (numItems <= 0) { - return new SortKey[0]; - } - - long offset = startingOffset; - Util.checkBounds(offset, Integer.BYTES, mem.getCapacity()); - int numBytes = mem.getInt(offset); - offset += Integer.BYTES; - - Util.checkBounds(offset, numBytes, mem.getCapacity()); - byte[] sortKeyBytes = new byte[numBytes]; - mem.getByteArray(offset, sortKeyBytes, 0, numBytes); - input.setBuffer(sortKeyBytes); - - try { - List sortKeys = listSerializer.deserialize(input); - SortKey[] array = new SortKey[numItems]; - sortKeys.toArray(array); - input.releaseArrays(); - return array; - } catch (IOException e) { - throw new UncheckedIOException("Failed to deserialize sort key sketch", e); - } - } - - @Override - public int sizeOf(SortKey item) { - return serializeToByteArray(item).length; - } - - @Override - public int sizeOf(Memory mem, long offset, int numItems) { - Preconditions.checkArgument(mem != null, "Invalid input memory: null"); - if (numItems <= 0) { - return 0; - } - - Util.checkBounds(offset, Integer.BYTES, mem.getCapacity()); - int numBytes = mem.getInt(offset); - return Integer.BYTES + numBytes; - } - - @Override - public String toString(SortKey item) { - return item.toString(); - } - - @Override - public Class getClassOfT() { - return SortKey.class; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java deleted file mode 100644 index 1e5bdbbac3e4..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
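SortKeySketchSerializer above frames each serialized sort key as a 4-byte little-endian length followed by the payload, so items can be sliced back out of a flat memory region during deserialization. A self-contained sketch of that framing using only java.nio (hypothetical class name, not the patch's code):

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;

public class LengthPrefixedFrameSketch {
  // Frame layout: a 4-byte little-endian length followed by the serialized payload.
  static byte[] frame(byte[] payload) {
    ByteBuffer buf =
        ByteBuffer.allocate(Integer.BYTES + payload.length).order(ByteOrder.LITTLE_ENDIAN);
    buf.putInt(payload.length);
    buf.put(payload);
    return buf.array();
  }

  static byte[] unframe(byte[] framed) {
    ByteBuffer buf = ByteBuffer.wrap(framed).order(ByteOrder.LITTLE_ENDIAN);
    byte[] payload = new byte[buf.getInt()];
    buf.get(payload);
    return payload;
  }

  public static void main(String[] args) {
    byte[] framed = frame("sort-key".getBytes(StandardCharsets.UTF_8));
    System.out.println(new String(unframe(framed), StandardCharsets.UTF_8)); // sort-key
  }
}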
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.List; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortField; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -class SortKeyUtil { - private SortKeyUtil() {} - - /** Compute the result schema of {@code SortKey} transformation */ - static Schema sortKeySchema(Schema schema, SortOrder sortOrder) { - List sortFields = sortOrder.fields(); - int size = sortFields.size(); - List transformedFields = Lists.newArrayListWithCapacity(size); - for (int i = 0; i < size; ++i) { - int sourceFieldId = sortFields.get(i).sourceId(); - Types.NestedField sourceField = schema.findField(sourceFieldId); - Preconditions.checkArgument( - sourceField != null, "Cannot find source field: %s", sourceFieldId); - Type transformedType = sortFields.get(i).transform().getResultType(sourceField.type()); - // There could be multiple transformations on the same source column, like in the PartitionKey - // case. To resolve the collision, field id is set to transform index and field name is set to - // sourceFieldName_transformIndex - Types.NestedField transformedField = - Types.NestedField.of( - i, - sourceField.isOptional(), - sourceField.name() + '_' + i, - transformedType, - sourceField.doc()); - transformedFields.add(transformedField); - } - - return new Schema(transformedFields); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java deleted file mode 100644 index f6fcdb8b16ef..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.runtime.operators.coordination.OperatorEvent; - -/** - * DataStatisticsEvent is sent between data statistics coordinator and operator to transmit data - * statistics in bytes - */ -@Internal -class StatisticsEvent implements OperatorEvent { - - private static final long serialVersionUID = 1L; - private final long checkpointId; - private final byte[] statisticsBytes; - private final boolean applyImmediately; - - private StatisticsEvent(long checkpointId, byte[] statisticsBytes, boolean applyImmediately) { - this.checkpointId = checkpointId; - this.statisticsBytes = statisticsBytes; - this.applyImmediately = applyImmediately; - } - - static StatisticsEvent createTaskStatisticsEvent( - long checkpointId, - DataStatistics statistics, - TypeSerializer statisticsSerializer) { - // applyImmediately is really only relevant for coordinator to operator event. - // task reported statistics is always merged immediately by the coordinator. - return new StatisticsEvent( - checkpointId, - StatisticsUtil.serializeDataStatistics(statistics, statisticsSerializer), - true); - } - - static StatisticsEvent createGlobalStatisticsEvent( - GlobalStatistics statistics, - TypeSerializer statisticsSerializer, - boolean applyImmediately) { - return new StatisticsEvent( - statistics.checkpointId(), - StatisticsUtil.serializeGlobalStatistics(statistics, statisticsSerializer), - applyImmediately); - } - - long checkpointId() { - return checkpointId; - } - - byte[] statisticsBytes() { - return statisticsBytes; - } - - boolean applyImmediately() { - return applyImmediately; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java deleted file mode 100644 index bc28df2b0e22..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.Serializable; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * The wrapper class for data statistics and record. 
It is the only way for data statistics operator - * to send global data statistics to custom partitioner to distribute data based on statistics - * - *
<p>
    DataStatisticsOrRecord contains either data statistics(globally aggregated) or a record. It is - * sent from {@link DataStatisticsOperator} to partitioner. Once partitioner receives the data - * statistics, it will use that to decide the coming record should send to which writer subtask. - * After shuffling, a filter and mapper are required to filter out the data distribution weight, - * unwrap the object and extract the original record type T. - */ -@Internal -public class StatisticsOrRecord implements Serializable { - - private static final long serialVersionUID = 1L; - - private GlobalStatistics statistics; - private RowData record; - - private StatisticsOrRecord(GlobalStatistics statistics, RowData record) { - Preconditions.checkArgument( - record != null ^ statistics != null, "DataStatistics or record, not neither or both"); - this.statistics = statistics; - this.record = record; - } - - static StatisticsOrRecord fromRecord(RowData record) { - return new StatisticsOrRecord(null, record); - } - - static StatisticsOrRecord fromStatistics(GlobalStatistics statistics) { - return new StatisticsOrRecord(statistics, null); - } - - static StatisticsOrRecord reuseRecord( - StatisticsOrRecord reuse, TypeSerializer recordSerializer) { - if (reuse.hasRecord()) { - return reuse; - } else { - // not reusable - return StatisticsOrRecord.fromRecord(recordSerializer.createInstance()); - } - } - - static StatisticsOrRecord reuseStatistics( - StatisticsOrRecord reuse, TypeSerializer statisticsSerializer) { - if (reuse.hasStatistics()) { - return reuse; - } else { - // not reusable - return StatisticsOrRecord.fromStatistics(statisticsSerializer.createInstance()); - } - } - - boolean hasStatistics() { - return statistics != null; - } - - public boolean hasRecord() { - return record != null; - } - - GlobalStatistics statistics() { - return statistics; - } - - void statistics(GlobalStatistics newStatistics) { - this.statistics = newStatistics; - } - - public RowData record() { - return record; - } - - void record(RowData newRecord) { - this.record = newRecord; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("statistics", statistics) - .add("record", record) - .toString(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java deleted file mode 100644 index d4ae2b359679..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
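StatisticsOrRecord above is essentially an either-type: exactly one of the two sides may be set, enforced by the XOR precondition in the constructor. A generic, plain-Java sketch of the same invariant (hypothetical class, simplified from the GlobalStatistics/RowData pair used above):

public final class EitherSketch<S, R> {
  private final S statistics;
  private final R record;

  private EitherSketch(S statistics, R record) {
    // Exactly one side must be set, mirroring the XOR precondition above.
    if ((statistics == null) == (record == null)) {
      throw new IllegalArgumentException("statistics or record, not neither or both");
    }
    this.statistics = statistics;
    this.record = record;
  }

  static <S, R> EitherSketch<S, R> fromStatistics(S statistics) {
    return new EitherSketch<>(statistics, null);
  }

  static <S, R> EitherSketch<S, R> fromRecord(R record) {
    return new EitherSketch<>(null, record);
  }

  boolean hasStatistics() {
    return statistics != null;
  }

  boolean hasRecord() {
    return record != null;
  }

  public static void main(String[] args) {
    EitherSketch<int[], String> wrapped = EitherSketch.fromRecord("row-1");
    System.out.println(wrapped.hasRecord() + " " + wrapped.hasStatistics()); // true false
  }
}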
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.util.Objects; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.flink.table.data.RowData; - -@Internal -class StatisticsOrRecordSerializer extends TypeSerializer { - private final TypeSerializer statisticsSerializer; - private final TypeSerializer recordSerializer; - - StatisticsOrRecordSerializer( - TypeSerializer statisticsSerializer, - TypeSerializer recordSerializer) { - this.statisticsSerializer = statisticsSerializer; - this.recordSerializer = recordSerializer; - } - - @Override - public boolean isImmutableType() { - return false; - } - - @SuppressWarnings("ReferenceEquality") - @Override - public TypeSerializer duplicate() { - TypeSerializer duplicateStatisticsSerializer = - statisticsSerializer.duplicate(); - TypeSerializer duplicateRowDataSerializer = recordSerializer.duplicate(); - if ((statisticsSerializer != duplicateStatisticsSerializer) - || (recordSerializer != duplicateRowDataSerializer)) { - return new StatisticsOrRecordSerializer( - duplicateStatisticsSerializer, duplicateRowDataSerializer); - } else { - return this; - } - } - - @Override - public StatisticsOrRecord createInstance() { - // arbitrarily always create RowData value instance - return StatisticsOrRecord.fromRecord(recordSerializer.createInstance()); - } - - @Override - public StatisticsOrRecord copy(StatisticsOrRecord from) { - if (from.hasRecord()) { - return StatisticsOrRecord.fromRecord(recordSerializer.copy(from.record())); - } else { - return StatisticsOrRecord.fromStatistics(statisticsSerializer.copy(from.statistics())); - } - } - - @Override - public StatisticsOrRecord copy(StatisticsOrRecord from, StatisticsOrRecord reuse) { - StatisticsOrRecord to; - if (from.hasRecord()) { - to = StatisticsOrRecord.reuseRecord(reuse, recordSerializer); - RowData record = recordSerializer.copy(from.record(), to.record()); - to.record(record); - } else { - to = StatisticsOrRecord.reuseStatistics(reuse, statisticsSerializer); - GlobalStatistics statistics = statisticsSerializer.copy(from.statistics(), to.statistics()); - to.statistics(statistics); - } - - return to; - } - - @Override - public int getLength() { - return -1; - } - - @Override - public void serialize(StatisticsOrRecord statisticsOrRecord, DataOutputView target) - throws IOException { - if (statisticsOrRecord.hasRecord()) { - target.writeBoolean(true); - recordSerializer.serialize(statisticsOrRecord.record(), target); - } else { - target.writeBoolean(false); - statisticsSerializer.serialize(statisticsOrRecord.statistics(), target); - } - } - - @Override - public StatisticsOrRecord deserialize(DataInputView source) throws IOException { - boolean isRecord = source.readBoolean(); - if (isRecord) { - return StatisticsOrRecord.fromRecord(recordSerializer.deserialize(source)); - } else { - return StatisticsOrRecord.fromStatistics(statisticsSerializer.deserialize(source)); - } - } - - @Override - public StatisticsOrRecord deserialize(StatisticsOrRecord reuse, DataInputView source) - throws IOException { - StatisticsOrRecord to; - boolean isRecord = source.readBoolean(); - if (isRecord) { - to = StatisticsOrRecord.reuseRecord(reuse, 
recordSerializer); - RowData record = recordSerializer.deserialize(to.record(), source); - to.record(record); - } else { - to = StatisticsOrRecord.reuseStatistics(reuse, statisticsSerializer); - GlobalStatistics statistics = statisticsSerializer.deserialize(to.statistics(), source); - to.statistics(statistics); - } - - return to; - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws IOException { - boolean hasRecord = source.readBoolean(); - target.writeBoolean(hasRecord); - if (hasRecord) { - recordSerializer.copy(source, target); - } else { - statisticsSerializer.copy(source, target); - } - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof StatisticsOrRecordSerializer)) { - return false; - } - - StatisticsOrRecordSerializer other = (StatisticsOrRecordSerializer) obj; - return Objects.equals(statisticsSerializer, other.statisticsSerializer) - && Objects.equals(recordSerializer, other.recordSerializer); - } - - @Override - public int hashCode() { - return Objects.hash(statisticsSerializer, recordSerializer); - } - - @Override - public TypeSerializerSnapshot snapshotConfiguration() { - return new StatisticsOrRecordSerializerSnapshot(this); - } - - public static class StatisticsOrRecordSerializerSnapshot - extends CompositeTypeSerializerSnapshot { - private static final int CURRENT_VERSION = 1; - - /** Constructor for read instantiation. */ - @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) - public StatisticsOrRecordSerializerSnapshot() {} - - @SuppressWarnings("checkstyle:RedundantModifier") - public StatisticsOrRecordSerializerSnapshot(StatisticsOrRecordSerializer serializer) { - super(serializer); - } - - @SuppressWarnings("checkstyle:RedundantModifier") - @Override - protected int getCurrentOuterSnapshotVersion() { - return CURRENT_VERSION; - } - - @Override - protected TypeSerializer[] getNestedSerializers( - StatisticsOrRecordSerializer outerSerializer) { - return new TypeSerializer[] { - outerSerializer.statisticsSerializer, outerSerializer.recordSerializer - }; - } - - @SuppressWarnings("unchecked") - @Override - protected StatisticsOrRecordSerializer createOuterSerializerWithNestedSerializers( - TypeSerializer[] nestedSerializers) { - TypeSerializer statisticsSerializer = - (TypeSerializer) nestedSerializers[0]; - TypeSerializer recordSerializer = (TypeSerializer) nestedSerializers[1]; - return new StatisticsOrRecordSerializer(statisticsSerializer, recordSerializer); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java deleted file mode 100644 index 921ede9466e0..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordTypeInformation.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
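StatisticsOrRecordSerializer above discriminates the two cases with a single boolean tag and, in copy(DataInputView, DataOutputView), forwards the tag and the matching payload without materializing the element. A simplified sketch of that passthrough, with a String standing in for the record and a long for the statistics (hypothetical class name, not the patch's code):

import java.io.IOException;
import org.apache.flink.core.memory.DataInputDeserializer;
import org.apache.flink.core.memory.DataOutputSerializer;

public class TaggedCopySketch {
  // Read the boolean tag, forward it, then forward the matching payload.
  static void copy(DataInputDeserializer source, DataOutputSerializer target) throws IOException {
    boolean hasRecord = source.readBoolean();
    target.writeBoolean(hasRecord);
    if (hasRecord) {
      target.writeUTF(source.readUTF());
    } else {
      target.writeLong(source.readLong());
    }
  }

  public static void main(String[] args) throws IOException {
    DataOutputSerializer original = new DataOutputSerializer(32);
    original.writeBoolean(true);
    original.writeUTF("row-1");

    DataOutputSerializer copied = new DataOutputSerializer(32);
    copy(new DataInputDeserializer(original.getCopyOfBuffer()), copied);

    DataInputDeserializer check = new DataInputDeserializer(copied.getCopyOfBuffer());
    System.out.println(check.readBoolean() + " " + check.readUTF()); // true row-1
  }
}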
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Objects; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.serialization.SerializerConfig; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; - -public class StatisticsOrRecordTypeInformation extends TypeInformation { - - private final TypeInformation rowTypeInformation; - private final SortOrder sortOrder; - private final GlobalStatisticsSerializer globalStatisticsSerializer; - - public StatisticsOrRecordTypeInformation( - RowType flinkRowType, Schema schema, SortOrder sortOrder) { - this.sortOrder = sortOrder; - this.rowTypeInformation = FlinkCompatibilityUtil.toTypeInfo(flinkRowType); - this.globalStatisticsSerializer = - new GlobalStatisticsSerializer(new SortKeySerializer(schema, sortOrder)); - } - - @Override - public boolean isBasicType() { - return false; - } - - @Override - public boolean isTupleType() { - return false; - } - - @Override - public int getArity() { - return 1; - } - - @Override - public int getTotalFields() { - return 1; - } - - @Override - public Class getTypeClass() { - return StatisticsOrRecord.class; - } - - @Override - public boolean isKeyType() { - return false; - } - - @Override - public TypeSerializer createSerializer(SerializerConfig config) { - TypeSerializer recordSerializer = rowTypeInformation.createSerializer(config); - return new StatisticsOrRecordSerializer(globalStatisticsSerializer, recordSerializer); - } - - @Override - public TypeSerializer createSerializer(ExecutionConfig config) { - return createSerializer(config.getSerializerConfig()); - } - - @Override - public String toString() { - return "StatisticsOrRecord"; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } else if (o != null && this.getClass() == o.getClass()) { - StatisticsOrRecordTypeInformation that = (StatisticsOrRecordTypeInformation) o; - return that.sortOrder.equals(sortOrder) - && that.rowTypeInformation.equals(rowTypeInformation) - && that.globalStatisticsSerializer.equals(globalStatisticsSerializer); - } else { - return false; - } - } - - @Override - public int hashCode() { - return Objects.hash(rowTypeInformation, sortOrder, globalStatisticsSerializer); - } - - @Override - public boolean canEqual(Object obj) { - return obj instanceof StatisticsOrRecordTypeInformation; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java deleted file mode 100644 index 43f72e336e06..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under 
one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -/** - * Range distribution requires gathering statistics on the sort keys to determine proper range - * boundaries to distribute/cluster rows before writer operators. - */ -public enum StatisticsType { - /** - * Tracks the data statistics as {@code Map} frequency. It works better for - * low-cardinality scenarios (like country, event_type, etc.) where the cardinalities are in - * hundreds or thousands. - * - *
<ul> - *   <li>Pro: accurate measurement on the statistics/weight of every key. - *   <li>Con: memory footprint can be large if the key cardinality is high. - * </ul> - */ - Map, - - /** - * Sample the sort keys via reservoir sampling. Then split the range partitions via range bounds - * from sampled values. It works better for high-cardinality scenarios (like device_id, user_id, - * uuid etc.) where the cardinalities can be in millions or billions. - * - * <ul> - *   <li>Pro: relatively low memory footprint for high-cardinality sort keys. - *   <li>Con: non-precise approximation with potentially lower accuracy. - * </ul>
    - */ - Sketch, - - /** - * Initially use Map for statistics tracking. If key cardinality turns out to be high, - * automatically switch to sketch sampling. - */ - Auto -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java deleted file mode 100644 index f2efc7fa9834..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.io.UncheckedIOException; -import javax.annotation.Nullable; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; - -class StatisticsUtil { - - private StatisticsUtil() {} - - static DataStatistics createTaskStatistics( - StatisticsType type, int operatorParallelism, int numPartitions) { - if (type == StatisticsType.Map) { - return new MapDataStatistics(); - } else { - return new SketchDataStatistics( - SketchUtil.determineOperatorReservoirSize(operatorParallelism, numPartitions)); - } - } - - static byte[] serializeDataStatistics( - DataStatistics dataStatistics, TypeSerializer statisticsSerializer) { - DataOutputSerializer out = new DataOutputSerializer(64); - try { - statisticsSerializer.serialize(dataStatistics, out); - return out.getCopyOfBuffer(); - } catch (IOException e) { - throw new UncheckedIOException("Fail to serialize data statistics", e); - } - } - - static DataStatistics deserializeDataStatistics( - byte[] bytes, TypeSerializer statisticsSerializer) { - DataInputDeserializer input = new DataInputDeserializer(bytes, 0, bytes.length); - try { - return statisticsSerializer.deserialize(input); - } catch (IOException e) { - throw new UncheckedIOException("Fail to deserialize data statistics", e); - } - } - - static byte[] serializeCompletedStatistics( - CompletedStatistics completedStatistics, - TypeSerializer statisticsSerializer) { - try { - DataOutputSerializer out = new DataOutputSerializer(1024); - statisticsSerializer.serialize(completedStatistics, out); - return out.getCopyOfBuffer(); - } catch (IOException e) { - throw new UncheckedIOException("Fail to serialize aggregated statistics", e); - } - } - - static CompletedStatistics deserializeCompletedStatistics( - byte[] bytes, CompletedStatisticsSerializer statisticsSerializer) { - try { - DataInputDeserializer input = new DataInputDeserializer(bytes); - CompletedStatistics completedStatistics = statisticsSerializer.deserialize(input); - if 
(!completedStatistics.isValid()) { - throw new RuntimeException("Fail to deserialize aggregated statistics,change to v1"); - } - - return completedStatistics; - } catch (Exception e) { - try { - // If we restore from a lower version, the new version of SortKeySerializer cannot correctly - // parse the checkpointData, so we need to first switch the version to v1. Once the state - // data is successfully parsed, we need to switch the serialization version to the latest - // version to parse the subsequent data passed from the TM. - statisticsSerializer.changeSortKeySerializerVersion(1); - DataInputDeserializer input = new DataInputDeserializer(bytes); - CompletedStatistics deserialize = statisticsSerializer.deserialize(input); - statisticsSerializer.changeSortKeySerializerVersionLatest(); - return deserialize; - } catch (IOException ioException) { - throw new UncheckedIOException("Fail to deserialize aggregated statistics", ioException); - } - } - } - - static byte[] serializeGlobalStatistics( - GlobalStatistics globalStatistics, TypeSerializer statisticsSerializer) { - try { - DataOutputSerializer out = new DataOutputSerializer(1024); - statisticsSerializer.serialize(globalStatistics, out); - return out.getCopyOfBuffer(); - } catch (IOException e) { - throw new UncheckedIOException("Fail to serialize aggregated statistics", e); - } - } - - static GlobalStatistics deserializeGlobalStatistics( - byte[] bytes, TypeSerializer statisticsSerializer) { - try { - DataInputDeserializer input = new DataInputDeserializer(bytes); - return statisticsSerializer.deserialize(input); - } catch (IOException e) { - throw new UncheckedIOException("Fail to deserialize aggregated statistics", e); - } - } - - static StatisticsType collectType(StatisticsType config) { - return config == StatisticsType.Sketch ? StatisticsType.Sketch : StatisticsType.Map; - } - - static StatisticsType collectType(StatisticsType config, @Nullable GlobalStatistics statistics) { - if (statistics != null) { - return statistics.type(); - } - - return collectType(config); - } - - static StatisticsType collectType( - StatisticsType config, @Nullable CompletedStatistics statistics) { - if (statistics != null) { - return statistics.type(); - } - - return collectType(config); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java deleted file mode 100644 index 796434c45136..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
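The Sketch statistics type described above relies on reservoir sampling: keep a fixed-size, uniformly random sample of the sort keys seen so far, then derive range bounds from the sample. A minimal Algorithm R sketch over int keys (hypothetical class, independent of the DataSketches ReservoirItemsSketch used in the deleted code):

import java.util.Arrays;
import java.util.Random;

public class ReservoirSamplingSketch {
  // Algorithm R: keep a uniform sample of up to k items from a stream of unknown length.
  static int[] sample(int[] stream, int k, long seed) {
    Random random = new Random(seed);
    int[] reservoir = new int[Math.min(k, stream.length)];
    for (int i = 0; i < stream.length; i++) {
      if (i < reservoir.length) {
        reservoir[i] = stream[i];
      } else {
        int j = random.nextInt(i + 1); // replace a slot with probability reservoir.length / (i + 1)
        if (j < reservoir.length) {
          reservoir[j] = stream[i];
        }
      }
    }
    return reservoir;
  }

  public static void main(String[] args) {
    int[] keys = new int[1_000];
    for (int i = 0; i < keys.length; i++) {
      keys[i] = i;
    }
    System.out.println(Arrays.toString(sample(keys, 8, 42L)));
  }
}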
- */ -package org.apache.iceberg.flink.source; - -import org.apache.avro.generic.GenericRecord; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.io.CloseableIterator; - -public class AvroGenericRecordFileScanTaskReader implements FileScanTaskReader { - private final RowDataFileScanTaskReader rowDataReader; - private final RowDataToAvroGenericRecordConverter converter; - - public AvroGenericRecordFileScanTaskReader( - RowDataFileScanTaskReader rowDataReader, RowDataToAvroGenericRecordConverter converter) { - this.rowDataReader = rowDataReader; - this.converter = converter; - } - - @Override - public CloseableIterator open( - FileScanTask fileScanTask, InputFilesDecryptor inputFilesDecryptor) { - return CloseableIterator.transform( - rowDataReader.open(fileScanTask, inputFilesDecryptor), converter); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java deleted file mode 100644 index 3beda960cec8..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Iterator; -import java.util.Locale; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * Flink data iterator that reads {@link CombinedScanTask} into a {@link CloseableIterator} - * - * @param is the output data type returned by this iterator. 
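AvroGenericRecordFileScanTaskReader above adapts a RowData iterator into Avro records by wrapping it with a converter via CloseableIterator.transform. A plain-Java stand-in for that converter-wrapper pattern, including close() propagation (hypothetical class, deliberately not using the Iceberg CloseableIterator API):

import java.util.Iterator;
import java.util.List;
import java.util.function.Function;

public class TransformingIteratorSketch<I, O> implements Iterator<O>, AutoCloseable {
  private final Iterator<I> source;
  private final Function<I, O> converter;

  TransformingIteratorSketch(Iterator<I> source, Function<I, O> converter) {
    this.source = source;
    this.converter = converter;
  }

  @Override
  public boolean hasNext() {
    return source.hasNext();
  }

  @Override
  public O next() {
    return converter.apply(source.next()); // convert lazily, one element at a time
  }

  @Override
  public void close() throws Exception {
    if (source instanceof AutoCloseable) {
      ((AutoCloseable) source).close(); // propagate close to the wrapped reader
    }
  }

  public static void main(String[] args) throws Exception {
    try (TransformingIteratorSketch<Integer, String> it =
        new TransformingIteratorSketch<>(List.of(1, 2, 3).iterator(), i -> "row-" + i)) {
      it.forEachRemaining(System.out::println);
    }
  }
}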
- */ -@Internal -public class DataIterator implements CloseableIterator { - - private final FileScanTaskReader fileScanTaskReader; - - private final InputFilesDecryptor inputFilesDecryptor; - private final CombinedScanTask combinedTask; - - private Iterator tasks; - private CloseableIterator currentIterator; - private int fileOffset; - private long recordOffset; - - public DataIterator( - FileScanTaskReader fileScanTaskReader, - CombinedScanTask task, - FileIO io, - EncryptionManager encryption) { - this.fileScanTaskReader = fileScanTaskReader; - - this.inputFilesDecryptor = new InputFilesDecryptor(task, io, encryption); - this.combinedTask = task; - - this.tasks = task.files().iterator(); - this.currentIterator = CloseableIterator.empty(); - - // fileOffset starts at -1 because we started - // from an empty iterator that is not from the split files. - this.fileOffset = -1; - // record offset points to the record that next() should return when called - this.recordOffset = 0L; - } - - /** - * (startingFileOffset, startingRecordOffset) points to the next row that reader should resume - * from. E.g., if the seek position is (file=0, record=1), seek moves the iterator position to the - * 2nd row in file 0. When next() is called after seek, 2nd row from file 0 should be returned. - */ - public void seek(int startingFileOffset, long startingRecordOffset) { - Preconditions.checkState( - fileOffset == -1, "Seek should be called before any other iterator actions"); - // skip files - Preconditions.checkState( - startingFileOffset < combinedTask.files().size(), - "Invalid starting file offset %s for combined scan task with %s files: %s", - startingFileOffset, - combinedTask.files().size(), - combinedTask); - for (long i = 0L; i < startingFileOffset; ++i) { - tasks.next(); - } - - updateCurrentIterator(); - // skip records within the file - for (long i = 0; i < startingRecordOffset; ++i) { - if (currentFileHasNext() && hasNext()) { - next(); - } else { - throw new IllegalStateException( - String.format( - Locale.ROOT, - "Invalid starting record offset %d for file %d from CombinedScanTask: %s", - startingRecordOffset, - startingFileOffset, - combinedTask)); - } - } - - fileOffset = startingFileOffset; - recordOffset = startingRecordOffset; - } - - @Override - public boolean hasNext() { - updateCurrentIterator(); - return currentIterator.hasNext(); - } - - @Override - public T next() { - updateCurrentIterator(); - recordOffset += 1; - return currentIterator.next(); - } - - public boolean currentFileHasNext() { - return currentIterator.hasNext(); - } - - /** Updates the current iterator field to ensure that the current Iterator is not exhausted. 
*/ - private void updateCurrentIterator() { - try { - while (!currentIterator.hasNext() && tasks.hasNext()) { - currentIterator.close(); - currentIterator = openTaskIterator(tasks.next()); - fileOffset += 1; - recordOffset = 0L; - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - private CloseableIterator openTaskIterator(FileScanTask scanTask) { - return fileScanTaskReader.open(scanTask, inputFilesDecryptor); - } - - @Override - public void close() throws IOException { - // close the current iterator - currentIterator.close(); - tasks = null; - } - - public int fileOffset() { - return fileOffset; - } - - public long recordOffset() { - return recordOffset; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java deleted file mode 100644 index 4394dab4d4cc..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Schema; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.flink.data.StructRowData; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.CloseableIterator; - -@Internal -public class DataTaskReader implements FileScanTaskReader { - - private final Schema readSchema; - - public DataTaskReader(Schema readSchema) { - this.readSchema = readSchema; - } - - @Override - public CloseableIterator open( - FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { - StructRowData row = new StructRowData(readSchema.asStruct()); - CloseableIterable iterable = - CloseableIterable.transform(task.asDataTask().rows(), row::setStruct); - return iterable.iterator(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java deleted file mode 100644 index 927a804a4792..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
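DataIterator.seek above resumes a split at a (fileOffset, recordOffset) position by skipping whole files first and then records within the target file. A simplified sketch of just that offset arithmetic over nested lists (hypothetical class; it ignores the chaining to subsequent files that the real iterator handles):

import java.util.Iterator;
import java.util.List;

public class SeekableSplitIteratorSketch {
  // Resume a nested iteration at (fileOffset, recordOffset).
  static Iterator<String> seek(List<List<String>> files, int fileOffset, long recordOffset) {
    Iterator<List<String>> fileIterator = files.iterator();
    for (int i = 0; i < fileOffset; i++) {
      fileIterator.next(); // skip whole files before the target file
    }
    Iterator<String> records = fileIterator.next().iterator();
    for (long i = 0; i < recordOffset; i++) {
      records.next(); // skip records already consumed within the target file
    }
    return records;
  }

  public static void main(String[] args) {
    List<List<String>> files =
        List.of(List.of("a0", "a1"), List.of("b0", "b1", "b2"), List.of("c0"));
    seek(files, 1, 2).forEachRemaining(System.out::println); // prints b2
  }
}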
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.Serializable; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.io.CloseableIterator; - -/** - * Read a {@link FileScanTask} into a {@link CloseableIterator} - * - * @param is the output data type returned by this iterator. - */ -@Internal -public interface FileScanTaskReader extends Serializable { - CloseableIterator open(FileScanTask fileScanTask, InputFilesDecryptor inputFilesDecryptor); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java deleted file mode 100644 index a68f0e50e0d0..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.concurrent.ExecutorService; -import org.apache.flink.api.common.io.DefaultInputSplitAssigner; -import org.apache.flink.api.common.io.InputFormat; -import org.apache.flink.api.common.io.LocatableInputSplitAssigner; -import org.apache.flink.api.common.io.RichInputFormat; -import org.apache.flink.api.common.io.statistics.BaseStatistics; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.io.InputSplitAssigner; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.BaseMetadataTable; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.util.ThreadPools; - -/** Flink {@link InputFormat} for Iceberg. 
*/ -public class FlinkInputFormat extends RichInputFormat { - - private static final long serialVersionUID = 1L; - - private final TableLoader tableLoader; - private final FileIO io; - private final EncryptionManager encryption; - private final ScanContext context; - private final FileScanTaskReader rowDataReader; - - private transient DataIterator iterator; - private transient long currentReadCount = 0L; - - FlinkInputFormat( - TableLoader tableLoader, - Schema tableSchema, - FileIO io, - EncryptionManager encryption, - ScanContext context) { - this.tableLoader = tableLoader; - this.io = io; - this.encryption = encryption; - this.context = context; - - tableLoader.open(); - Table table = tableLoader.loadTable(); - if (table instanceof BaseMetadataTable) { - this.rowDataReader = new DataTaskReader(context.project()); - } else { - this.rowDataReader = - new RowDataFileScanTaskReader( - tableSchema, - context.project(), - context.nameMapping(), - context.caseSensitive(), - context.filters()); - } - } - - @VisibleForTesting - Schema projectedSchema() { - return context.project(); - } - - @Override - public BaseStatistics getStatistics(BaseStatistics cachedStatistics) { - // Legacy method, not be used. - return null; - } - - @Override - public FlinkInputSplit[] createInputSplits(int minNumSplits) throws IOException { - // Called in Job manager, so it is OK to load table from catalog. - tableLoader.open(); - final ExecutorService workerPool = - ThreadPools.newFixedThreadPool("iceberg-plan-worker-pool", context.planParallelism()); - try (TableLoader loader = tableLoader) { - Table table = loader.loadTable(); - return FlinkSplitPlanner.planInputSplits(table, context, workerPool); - } finally { - workerPool.shutdown(); - } - } - - @Override - public InputSplitAssigner getInputSplitAssigner(FlinkInputSplit[] inputSplits) { - return context.exposeLocality() - ? new LocatableInputSplitAssigner(inputSplits) - : new DefaultInputSplitAssigner(inputSplits); - } - - @Override - public void configure(Configuration parameters) {} - - @Override - public void open(FlinkInputSplit split) { - this.iterator = new DataIterator<>(rowDataReader, split.getTask(), io, encryption); - } - - @Override - public boolean reachedEnd() { - if (context.limit() > 0 && currentReadCount >= context.limit()) { - return true; - } else { - return !iterator.hasNext(); - } - } - - @Override - public RowData nextRecord(RowData reuse) { - currentReadCount++; - return iterator.next(); - } - - @Override - public void close() throws IOException { - if (iterator != null) { - iterator.close(); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java deleted file mode 100644 index 16fd4f39596c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
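FlinkInputFormat above ends a read either when the split is exhausted or when the configured limit has been reached, which is why reachedEnd() checks currentReadCount against context.limit(). A tiny sketch of that contract over a plain iterator (hypothetical class name):

import java.util.Iterator;
import java.util.List;

public class LimitedReadLoopSketch {
  // Stop when the iterator is exhausted or when a positive limit has been reached.
  static <T> long readAll(Iterator<T> records, long limit) {
    long readCount = 0;
    while (!(limit > 0 && readCount >= limit) && records.hasNext()) {
      records.next();
      readCount++;
    }
    return readCount;
  }

  public static void main(String[] args) {
    System.out.println(readAll(List.of(1, 2, 3, 4, 5).iterator(), 3));  // 3
    System.out.println(readAll(List.of(1, 2, 3).iterator(), -1));       // 3 (no limit)
  }
}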
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.util.Arrays; -import javax.annotation.Nullable; -import org.apache.flink.core.io.LocatableInputSplit; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -public class FlinkInputSplit extends LocatableInputSplit { - - private final CombinedScanTask task; - - FlinkInputSplit(int splitNumber, CombinedScanTask task, @Nullable String[] hostnames) { - super(splitNumber, hostnames); - this.task = task; - } - - CombinedScanTask getTask() { - return task; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("splitNumber", getSplitNumber()) - .add("task", task) - .add("hosts", Arrays.toString(getHostnames())) - .toString(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java deleted file mode 100644 index b5a1ba85a6cb..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java +++ /dev/null @@ -1,318 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.time.Duration; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TableScan; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.PropertyUtil; - -/** - * Flink source builder for old {@link SourceFunction} implementation. - * - * @deprecated since 1.7.0, will be removed in 2.0.0. Use {@link IcebergSource} instead, which - * implement the newer FLIP-27 source interface. This class implements the old {@link - * SourceFunction} that has been marked as deprecated in Flink since Aug 2023. - */ -@Deprecated -public class FlinkSource { - private FlinkSource() {} - - /** - * Initialize a {@link Builder} to read the data from iceberg table. Equivalent to {@link - * TableScan}. See more options in {@link ScanContext}. - * - *

- * <p>The Source can read static data in bounded mode. It can also continuously check for the
- * arrival of new data and read records incrementally.
- *
- * <ul>
- *   <li>Without startSnapshotId: Bounded
- *   <li>With startSnapshotId and with endSnapshotId: Bounded
- *   <li>With startSnapshotId (-1 means unbounded preceding) and without endSnapshotId: Unbounded
- * </ul>
    - * - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder forRowData() { - return new Builder(); - } - - /** Source builder to build {@link DataStream}. */ - public static class Builder { - private StreamExecutionEnvironment env; - private Table table; - private TableLoader tableLoader; - private TableSchema projectedSchema; - private ReadableConfig readableConfig = new Configuration(); - private final ScanContext.Builder contextBuilder = ScanContext.builder(); - private Boolean exposeLocality; - - private final Map readOptions = Maps.newHashMap(); - - public Builder tableLoader(TableLoader newLoader) { - this.tableLoader = newLoader; - return this; - } - - public Builder table(Table newTable) { - this.table = newTable; - return this; - } - - public Builder env(StreamExecutionEnvironment newEnv) { - this.env = newEnv; - return this; - } - - public Builder filters(List filters) { - contextBuilder.filters(filters); - return this; - } - - public Builder project(TableSchema schema) { - this.projectedSchema = schema; - return this; - } - - public Builder limit(Long newLimit) { - if (newLimit != null) { - readOptions.put(FlinkReadOptions.LIMIT, Long.toString(newLimit)); - } - return this; - } - - public Builder set(String property, String value) { - readOptions.put(property, value); - return this; - } - - public Builder setAll(Map properties) { - readOptions.putAll(properties); - return this; - } - - /** - * @deprecated Use {@link #setAll} instead. - */ - @Deprecated - public Builder properties(Map properties) { - readOptions.putAll(properties); - return this; - } - - public Builder caseSensitive(boolean caseSensitive) { - readOptions.put(FlinkReadOptions.CASE_SENSITIVE, Boolean.toString(caseSensitive)); - return this; - } - - public Builder snapshotId(Long snapshotId) { - readOptions.put(FlinkReadOptions.SNAPSHOT_ID.key(), Long.toString(snapshotId)); - return this; - } - - public Builder branch(String branch) { - readOptions.put(FlinkReadOptions.BRANCH.key(), branch); - return this; - } - - public Builder tag(String tag) { - readOptions.put(FlinkReadOptions.TAG.key(), tag); - return this; - } - - public Builder startSnapshotId(Long startSnapshotId) { - readOptions.put(FlinkReadOptions.START_SNAPSHOT_ID.key(), Long.toString(startSnapshotId)); - return this; - } - - public Builder endSnapshotId(Long endSnapshotId) { - readOptions.put(FlinkReadOptions.END_SNAPSHOT_ID.key(), Long.toString(endSnapshotId)); - return this; - } - - public Builder startTag(String startTag) { - readOptions.put(FlinkReadOptions.START_TAG.key(), startTag); - return this; - } - - public Builder endTag(String endTag) { - readOptions.put(FlinkReadOptions.END_TAG.key(), endTag); - return this; - } - - public Builder asOfTimestamp(Long asOfTimestamp) { - readOptions.put(FlinkReadOptions.AS_OF_TIMESTAMP.key(), Long.toString(asOfTimestamp)); - return this; - } - - public Builder splitSize(Long splitSize) { - readOptions.put(FlinkReadOptions.SPLIT_SIZE, Long.toString(splitSize)); - return this; - } - - public Builder splitLookback(Integer splitLookback) { - readOptions.put(FlinkReadOptions.SPLIT_LOOKBACK, Integer.toString(splitLookback)); - return this; - } - - public Builder splitOpenFileCost(Long splitOpenFileCost) { - readOptions.put(FlinkReadOptions.SPLIT_FILE_OPEN_COST, Long.toString(splitOpenFileCost)); - return this; - } - - public Builder streaming(boolean streaming) { - readOptions.put(FlinkReadOptions.STREAMING, Boolean.toString(streaming)); - return this; - } - - public Builder 
exposeLocality(boolean newExposeLocality) { - this.exposeLocality = newExposeLocality; - return this; - } - - public Builder nameMapping(String nameMapping) { - readOptions.put(TableProperties.DEFAULT_NAME_MAPPING, nameMapping); - return this; - } - - public Builder monitorInterval(Duration interval) { - readOptions.put(FlinkReadOptions.MONITOR_INTERVAL, interval.toNanos() + " ns"); - return this; - } - - public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { - readOptions.put( - FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT, - Integer.toString(newMaxPlanningSnapshotCount)); - return this; - } - - public Builder flinkConf(ReadableConfig config) { - this.readableConfig = config; - return this; - } - - public FlinkInputFormat buildFormat() { - Preconditions.checkNotNull(tableLoader, "TableLoader should not be null"); - - Schema icebergSchema; - FileIO io; - EncryptionManager encryption; - if (table == null) { - // load required fields by table loader. - tableLoader.open(); - try (TableLoader loader = tableLoader) { - table = loader.loadTable(); - icebergSchema = table.schema(); - io = table.io(); - encryption = table.encryption(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } else { - icebergSchema = table.schema(); - io = table.io(); - encryption = table.encryption(); - } - - if (projectedSchema == null) { - contextBuilder.project(icebergSchema); - } else { - contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedSchema)); - } - - contextBuilder.exposeLocality( - SourceUtil.isLocalityEnabled(table, readableConfig, exposeLocality)); - contextBuilder.planParallelism( - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE)); - - contextBuilder.resolveConfig(table, readOptions, readableConfig); - - ScanContext context = contextBuilder.build(); - context.validate(); - return new FlinkInputFormat(tableLoader, icebergSchema, io, encryption, context); - } - - public DataStream build() { - Preconditions.checkNotNull(env, "StreamExecutionEnvironment should not be null"); - FlinkInputFormat format = buildFormat(); - - ScanContext context = contextBuilder.build(); - TypeInformation typeInfo = - FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(context.project())); - - if (!context.isStreaming()) { - int parallelism = - SourceUtil.inferParallelism( - readableConfig, - context.limit(), - () -> { - try { - return format.createInputSplits(0).length; - } catch (IOException e) { - throw new UncheckedIOException( - "Failed to create iceberg input splits for table: " + table, e); - } - }); - if (env.getMaxParallelism() > 0) { - parallelism = Math.min(parallelism, env.getMaxParallelism()); - } - return env.createInput(format, typeInfo).setParallelism(parallelism); - } else { - StreamingMonitorFunction function = new StreamingMonitorFunction(tableLoader, context); - - String monitorFunctionName = String.format("Iceberg table (%s) monitor", table); - String readerOperatorName = String.format("Iceberg table (%s) reader", table); - - return env.addSource(function, monitorFunctionName) - .transform(readerOperatorName, typeInfo, StreamingReaderOperator.factory(format)); - } - } - } - - public static boolean isBounded(Map properties) { - return !PropertyUtil.propertyAsBoolean(properties, FlinkReadOptions.STREAMING, false); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java deleted file 
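Before its removal, the deprecated FlinkSource builder above was typically driven as follows. A minimal sketch of a bounded read, assuming a Hadoop-table location (the path, class name, and job name are placeholders):

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.source.FlinkSource;

public class LegacyFlinkSourceSketch {
  public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    // Placeholder location; any TableLoader (catalog- or path-based) works here.
    TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");

    // Bounded mode: no startSnapshotId, so the current table state is scanned once.
    DataStream<RowData> batch =
        FlinkSource.forRowData()
            .env(env)
            .tableLoader(tableLoader)
            .streaming(false)
            .build();

    batch.print();
    env.execute("Legacy Iceberg batch read");
  }
}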
mode 100644 index 15078809714f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.List; -import java.util.concurrent.ExecutorService; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.IncrementalAppendScan; -import org.apache.iceberg.Scan; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TableScan; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.hadoop.Util; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.Tasks; - -@Internal -public class FlinkSplitPlanner { - private FlinkSplitPlanner() {} - - static FlinkInputSplit[] planInputSplits( - Table table, ScanContext context, ExecutorService workerPool) { - try (CloseableIterable tasksIterable = - planTasks(table, context, workerPool)) { - List tasks = Lists.newArrayList(tasksIterable); - FlinkInputSplit[] splits = new FlinkInputSplit[tasks.size()]; - boolean exposeLocality = context.exposeLocality(); - - Tasks.range(tasks.size()) - .stopOnFailure() - .executeWith(exposeLocality ? 
workerPool : null) - .run( - index -> { - CombinedScanTask task = tasks.get(index); - String[] hostnames = null; - if (exposeLocality) { - hostnames = Util.blockLocations(table.io(), task); - } - splits[index] = new FlinkInputSplit(index, task, hostnames); - }); - return splits; - } catch (IOException e) { - throw new UncheckedIOException("Failed to process tasks iterable", e); - } - } - - /** This returns splits for the FLIP-27 source */ - public static List planIcebergSourceSplits( - Table table, ScanContext context, ExecutorService workerPool) { - try (CloseableIterable tasksIterable = - planTasks(table, context, workerPool)) { - return Lists.newArrayList( - CloseableIterable.transform(tasksIterable, IcebergSourceSplit::fromCombinedScanTask)); - } catch (IOException e) { - throw new UncheckedIOException("Failed to process task iterable: ", e); - } - } - - static CloseableIterable planTasks( - Table table, ScanContext context, ExecutorService workerPool) { - ScanMode scanMode = checkScanMode(context); - if (scanMode == ScanMode.INCREMENTAL_APPEND_SCAN) { - IncrementalAppendScan scan = table.newIncrementalAppendScan(); - scan = refineScanWithBaseConfigs(scan, context, workerPool); - - if (context.startTag() != null) { - Preconditions.checkArgument( - table.snapshot(context.startTag()) != null, - "Cannot find snapshot with tag %s", - context.startTag()); - scan = scan.fromSnapshotExclusive(table.snapshot(context.startTag()).snapshotId()); - } - - if (context.startSnapshotId() != null) { - Preconditions.checkArgument( - context.startTag() == null, "START_SNAPSHOT_ID and START_TAG cannot both be set"); - scan = scan.fromSnapshotExclusive(context.startSnapshotId()); - } - - if (context.endTag() != null) { - Preconditions.checkArgument( - table.snapshot(context.endTag()) != null, - "Cannot find snapshot with tag %s", - context.endTag()); - scan = scan.toSnapshot(table.snapshot(context.endTag()).snapshotId()); - } - - if (context.endSnapshotId() != null) { - Preconditions.checkArgument( - context.endTag() == null, "END_SNAPSHOT_ID and END_TAG cannot both be set"); - scan = scan.toSnapshot(context.endSnapshotId()); - } - - return scan.planTasks(); - } else { - TableScan scan = table.newScan(); - scan = refineScanWithBaseConfigs(scan, context, workerPool); - - if (context.snapshotId() != null) { - scan = scan.useSnapshot(context.snapshotId()); - } else if (context.tag() != null) { - scan = scan.useRef(context.tag()); - } else if (context.branch() != null) { - scan = scan.useRef(context.branch()); - } - - if (context.asOfTimestamp() != null) { - scan = scan.asOfTime(context.asOfTimestamp()); - } - - return scan.planTasks(); - } - } - - @VisibleForTesting - enum ScanMode { - BATCH, - INCREMENTAL_APPEND_SCAN - } - - @VisibleForTesting - static ScanMode checkScanMode(ScanContext context) { - if (context.startSnapshotId() != null - || context.endSnapshotId() != null - || context.startTag() != null - || context.endTag() != null) { - return ScanMode.INCREMENTAL_APPEND_SCAN; - } else { - return ScanMode.BATCH; - } - } - - /** refine scan with common configs */ - private static > T refineScanWithBaseConfigs( - T scan, ScanContext context, ExecutorService workerPool) { - T refinedScan = - scan.caseSensitive(context.caseSensitive()).project(context.project()).planWith(workerPool); - - if (context.includeColumnStats()) { - refinedScan = refinedScan.includeColumnStats(); - } - - if (context.includeStatsForColumns() != null) { - refinedScan = refinedScan.includeColumnStats(context.includeStatsForColumns()); - 
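The planTasks() branch above maps directly onto two Iceberg scan APIs. A hedged sketch of what each mode resolves to, with placeholder snapshot ids and made-up helper names:

import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.IncrementalAppendScan;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;

class ScanModeSketch {
  // Batch mode: plan tasks against a single table state, optionally pinned to a snapshot.
  static CloseableIterable<CombinedScanTask> batchTasks(Table table, long snapshotId) {
    TableScan scan = table.newScan().useSnapshot(snapshotId);
    return scan.planTasks();
  }

  // Incremental append mode: plan only data appended between two snapshots.
  static CloseableIterable<CombinedScanTask> incrementalTasks(
      Table table, long fromSnapshotId, long toSnapshotId) {
    IncrementalAppendScan scan =
        table.newIncrementalAppendScan()
            .fromSnapshotExclusive(fromSnapshotId)
            .toSnapshot(toSnapshotId);
    return scan.planTasks();
  }
}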
} - - refinedScan = refinedScan.option(TableProperties.SPLIT_SIZE, context.splitSize().toString()); - - refinedScan = - refinedScan.option(TableProperties.SPLIT_LOOKBACK, context.splitLookback().toString()); - - refinedScan = - refinedScan.option( - TableProperties.SPLIT_OPEN_FILE_COST, context.splitOpenFileCost().toString()); - - if (context.filters() != null) { - for (Expression filter : context.filters()) { - refinedScan = refinedScan.filter(filter); - } - } - - return refinedScan; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java deleted file mode 100644 index 035682be8296..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java +++ /dev/null @@ -1,702 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.time.Duration; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import javax.annotation.Nullable; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.api.connector.source.Source; -import org.apache.flink.api.connector.source.SourceReader; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.api.connector.source.SplitEnumerator; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSource; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.BaseMetadataTable; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkReadConf; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TableLoader; -import 
org.apache.iceberg.flink.source.assigner.OrderedSplitAssignerFactory; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.flink.source.assigner.SplitAssigner; -import org.apache.iceberg.flink.source.assigner.SplitAssignerFactory; -import org.apache.iceberg.flink.source.enumerator.ContinuousIcebergEnumerator; -import org.apache.iceberg.flink.source.enumerator.ContinuousSplitPlanner; -import org.apache.iceberg.flink.source.enumerator.ContinuousSplitPlannerImpl; -import org.apache.iceberg.flink.source.enumerator.IcebergEnumeratorState; -import org.apache.iceberg.flink.source.enumerator.IcebergEnumeratorStateSerializer; -import org.apache.iceberg.flink.source.enumerator.StaticIcebergEnumerator; -import org.apache.iceberg.flink.source.reader.ColumnStatsWatermarkExtractor; -import org.apache.iceberg.flink.source.reader.ConverterReaderFunction; -import org.apache.iceberg.flink.source.reader.IcebergSourceReader; -import org.apache.iceberg.flink.source.reader.IcebergSourceReaderMetrics; -import org.apache.iceberg.flink.source.reader.MetaDataReaderFunction; -import org.apache.iceberg.flink.source.reader.ReaderFunction; -import org.apache.iceberg.flink.source.reader.RowDataConverter; -import org.apache.iceberg.flink.source.reader.RowDataReaderFunction; -import org.apache.iceberg.flink.source.reader.SerializableRecordEmitter; -import org.apache.iceberg.flink.source.reader.SplitWatermarkExtractor; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; -import org.apache.iceberg.flink.source.split.SerializableComparator; -import org.apache.iceberg.flink.source.split.SplitComparators; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class IcebergSource implements Source { - private static final Logger LOG = LoggerFactory.getLogger(IcebergSource.class); - - // This table loader can be closed, and it is only safe to use this instance for resource - // independent information (e.g. a table name). Copies of this are required to avoid lifecycle - // management conflicts with the user provided table loader. e.g. a copy of this is required for - // split planning, which uses the underlying io, and should be closed after split planning is - // complete. - private final TableLoader tableLoader; - private final ScanContext scanContext; - private final ReaderFunction readerFunction; - private final SplitAssignerFactory assignerFactory; - private final SerializableComparator splitComparator; - private final SerializableRecordEmitter emitter; - private final String tableName; - - // cache the discovered splits by planSplitsForBatch, which can be called twice. And they come - // from two different threads: (1) source/stream construction by main thread (2) enumerator - // creation. Hence need volatile here. 
- private volatile List batchSplits; - - IcebergSource( - TableLoader tableLoader, - ScanContext scanContext, - ReaderFunction readerFunction, - SplitAssignerFactory assignerFactory, - SerializableComparator splitComparator, - Table table, - SerializableRecordEmitter emitter) { - Preconditions.checkNotNull(tableLoader, "tableLoader is required."); - Preconditions.checkNotNull(readerFunction, "readerFunction is required."); - Preconditions.checkNotNull(assignerFactory, "assignerFactory is required."); - Preconditions.checkNotNull(table, "table is required."); - this.tableLoader = tableLoader; - this.scanContext = scanContext; - this.readerFunction = readerFunction; - this.assignerFactory = assignerFactory; - this.splitComparator = splitComparator; - this.emitter = emitter; - this.tableName = table.name(); - } - - String name() { - return "IcebergSource-" + tableName; - } - - private String planningThreadName() { - // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness - // within a job. SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which - // would contain the OperatorID. Need to discuss with Flink community whether it is ok to expose - // a public API like the protected method "OperatorCoordinator.Context getCoordinatorContext()" - // from SourceCoordinatorContext implementation. For now,

  • - is used as - // the unique thread pool name. - return tableName + "-" + UUID.randomUUID(); - } - - /** - * Cache the enumerated splits for batch execution to avoid double planning as there are two code - * paths obtaining splits: (1) infer parallelism (2) enumerator creation. - */ - private List planSplitsForBatch(String threadName) { - if (batchSplits != null) { - return batchSplits; - } - - ExecutorService workerPool = - ThreadPools.newFixedThreadPool(threadName, scanContext.planParallelism()); - try (TableLoader loader = tableLoader.clone()) { - loader.open(); - this.batchSplits = - FlinkSplitPlanner.planIcebergSourceSplits(loader.loadTable(), scanContext, workerPool); - LOG.info( - "Discovered {} splits from table {} during job initialization", - batchSplits.size(), - tableName); - return batchSplits; - } catch (IOException e) { - throw new UncheckedIOException("Failed to close table loader", e); - } finally { - workerPool.shutdown(); - } - } - - @Override - public Boundedness getBoundedness() { - return scanContext.isStreaming() ? Boundedness.CONTINUOUS_UNBOUNDED : Boundedness.BOUNDED; - } - - @Override - public SourceReader createReader(SourceReaderContext readerContext) { - IcebergSourceReaderMetrics metrics = - new IcebergSourceReaderMetrics(readerContext.metricGroup(), tableName); - return new IcebergSourceReader<>( - emitter, metrics, readerFunction, splitComparator, readerContext); - } - - @Override - public SplitEnumerator createEnumerator( - SplitEnumeratorContext enumContext) { - return createEnumerator(enumContext, null); - } - - @Override - public SplitEnumerator restoreEnumerator( - SplitEnumeratorContext enumContext, IcebergEnumeratorState enumState) { - return createEnumerator(enumContext, enumState); - } - - @Override - public SimpleVersionedSerializer getSplitSerializer() { - return new IcebergSourceSplitSerializer(scanContext.caseSensitive()); - } - - @Override - public SimpleVersionedSerializer getEnumeratorCheckpointSerializer() { - return new IcebergEnumeratorStateSerializer(scanContext.caseSensitive()); - } - - private SplitEnumerator createEnumerator( - SplitEnumeratorContext enumContext, - @Nullable IcebergEnumeratorState enumState) { - SplitAssigner assigner; - if (enumState == null) { - assigner = assignerFactory.createAssigner(); - } else { - LOG.info( - "Iceberg source restored {} splits from state for table {}", - enumState.pendingSplits().size(), - tableName); - assigner = assignerFactory.createAssigner(enumState.pendingSplits()); - } - if (scanContext.isStreaming()) { - ContinuousSplitPlanner splitPlanner = - new ContinuousSplitPlannerImpl(tableLoader, scanContext, planningThreadName()); - return new ContinuousIcebergEnumerator( - enumContext, assigner, scanContext, splitPlanner, enumState); - } else { - if (enumState == null) { - // Only do scan planning if nothing is restored from checkpoint state - List splits = planSplitsForBatch(planningThreadName()); - assigner.onDiscoveredSplits(splits); - // clear the cached splits after enumerator creation as they won't be needed anymore - this.batchSplits = null; - } - - return new StaticIcebergEnumerator(enumContext, assigner); - } - } - - private boolean shouldInferParallelism() { - return !scanContext.isStreaming(); - } - - private int inferParallelism(ReadableConfig flinkConf, StreamExecutionEnvironment env) { - int parallelism = - SourceUtil.inferParallelism( - flinkConf, - scanContext.limit(), - () -> { - List splits = planSplitsForBatch(planningThreadName()); - return splits.size(); - }); - - if 
(env.getMaxParallelism() > 0) { - parallelism = Math.min(parallelism, env.getMaxParallelism()); - } - - return parallelism; - } - - /** - * Create a source builder. - * - * @deprecated since 1.7.0. Will be removed in 2.0.0; use{@link IcebergSource#forRowData()} or - * {@link IcebergSource#forOutputType(RowDataConverter)} instead - */ - @Deprecated - public static Builder builder() { - return new Builder<>(); - } - - /** Create a source builder for RowData output type. */ - public static Builder forRowData() { - return new Builder<>(); - } - - /** - * Create a source builder that would convert {@link RowData} to the output type {@code T}. - * - * @param converter convert {@link RowData} to output type {@code T} - * @param output type - * @return an IcebergSource builder - */ - public static Builder forOutputType(RowDataConverter converter) { - return new Builder().converter(converter); - } - - public static class Builder { - private TableLoader tableLoader; - private Table table; - private SplitAssignerFactory splitAssignerFactory; - private SerializableComparator splitComparator; - private ReaderFunction readerFunction; - private RowDataConverter converter; - private ReadableConfig flinkConfig = new Configuration(); - private final ScanContext.Builder contextBuilder = ScanContext.builder(); - private TableSchema projectedTableSchema; - private ResolvedSchema projectedFlinkSchema; - private Boolean exposeLocality; - - private final Map readOptions = Maps.newHashMap(); - - Builder() {} - - public Builder tableLoader(TableLoader loader) { - this.tableLoader = loader; - return this; - } - - public Builder table(Table newTable) { - this.table = newTable; - return this; - } - - public Builder assignerFactory(SplitAssignerFactory assignerFactory) { - this.splitAssignerFactory = assignerFactory; - return this; - } - - public Builder splitComparator( - SerializableComparator newSplitComparator) { - this.splitComparator = newSplitComparator; - return this; - } - - /** - * @deprecated since 1.7.0. Will be removed in 2.0.0; use{@link - * IcebergSource#forOutputType(RowDataConverter)} instead to produce output type other than - * {@link RowData}. - */ - @Deprecated - public Builder readerFunction(ReaderFunction newReaderFunction) { - Preconditions.checkState( - converter == null, - "Cannot set reader function when builder was created via IcebergSource.forOutputType(Converter)"); - this.readerFunction = newReaderFunction; - return this; - } - - /** - * Don't need to be public. It is set by {@link IcebergSource#forOutputType(RowDataConverter)}. 
- */ - private Builder converter(RowDataConverter newConverter) { - this.converter = newConverter; - return this; - } - - public Builder flinkConfig(ReadableConfig config) { - this.flinkConfig = config; - return this; - } - - public Builder caseSensitive(boolean newCaseSensitive) { - readOptions.put(FlinkReadOptions.CASE_SENSITIVE, Boolean.toString(newCaseSensitive)); - return this; - } - - public Builder useSnapshotId(Long newSnapshotId) { - if (newSnapshotId != null) { - readOptions.put(FlinkReadOptions.SNAPSHOT_ID.key(), Long.toString(newSnapshotId)); - } - return this; - } - - public Builder streamingStartingStrategy(StreamingStartingStrategy newStartingStrategy) { - readOptions.put(FlinkReadOptions.STARTING_STRATEGY, newStartingStrategy.name()); - return this; - } - - public Builder startSnapshotTimestamp(Long newStartSnapshotTimestamp) { - if (newStartSnapshotTimestamp != null) { - readOptions.put( - FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.key(), - Long.toString(newStartSnapshotTimestamp)); - } - return this; - } - - public Builder startSnapshotId(Long newStartSnapshotId) { - if (newStartSnapshotId != null) { - readOptions.put( - FlinkReadOptions.START_SNAPSHOT_ID.key(), Long.toString(newStartSnapshotId)); - } - return this; - } - - public Builder tag(String tag) { - readOptions.put(FlinkReadOptions.TAG.key(), tag); - return this; - } - - public Builder branch(String branch) { - readOptions.put(FlinkReadOptions.BRANCH.key(), branch); - return this; - } - - public Builder startTag(String startTag) { - readOptions.put(FlinkReadOptions.START_TAG.key(), startTag); - return this; - } - - public Builder endTag(String endTag) { - readOptions.put(FlinkReadOptions.END_TAG.key(), endTag); - return this; - } - - public Builder endSnapshotId(Long newEndSnapshotId) { - if (newEndSnapshotId != null) { - readOptions.put(FlinkReadOptions.END_SNAPSHOT_ID.key(), Long.toString(newEndSnapshotId)); - } - return this; - } - - public Builder asOfTimestamp(Long newAsOfTimestamp) { - if (newAsOfTimestamp != null) { - readOptions.put(FlinkReadOptions.AS_OF_TIMESTAMP.key(), Long.toString(newAsOfTimestamp)); - } - return this; - } - - public Builder splitSize(Long newSplitSize) { - if (newSplitSize != null) { - readOptions.put(FlinkReadOptions.SPLIT_SIZE, Long.toString(newSplitSize)); - } - return this; - } - - public Builder splitLookback(Integer newSplitLookback) { - if (newSplitLookback != null) { - readOptions.put(FlinkReadOptions.SPLIT_LOOKBACK, Integer.toString(newSplitLookback)); - } - return this; - } - - public Builder splitOpenFileCost(Long newSplitOpenFileCost) { - if (newSplitOpenFileCost != null) { - readOptions.put(FlinkReadOptions.SPLIT_FILE_OPEN_COST, Long.toString(newSplitOpenFileCost)); - } - - return this; - } - - public Builder streaming(boolean streaming) { - readOptions.put(FlinkReadOptions.STREAMING, Boolean.toString(streaming)); - return this; - } - - public Builder monitorInterval(Duration newMonitorInterval) { - if (newMonitorInterval != null) { - readOptions.put(FlinkReadOptions.MONITOR_INTERVAL, newMonitorInterval.toNanos() + " ns"); - } - return this; - } - - public Builder nameMapping(String newNameMapping) { - readOptions.put(TableProperties.DEFAULT_NAME_MAPPING, newNameMapping); - return this; - } - - public Builder project(Schema newProjectedSchema) { - this.contextBuilder.project(newProjectedSchema); - return this; - } - - /** - * @deprecated since 1.10.0, will be removed in 2.0.0. Use {@link #project(ResolvedSchema)} - * instead. 
- */ - @Deprecated - public Builder project(TableSchema newProjectedFlinkSchema) { - this.projectedTableSchema = newProjectedFlinkSchema; - return this; - } - - public Builder project(ResolvedSchema newProjectedFlinkSchema) { - this.projectedFlinkSchema = newProjectedFlinkSchema; - return this; - } - - public Builder filters(List newFilters) { - this.contextBuilder.filters(newFilters); - return this; - } - - public Builder limit(Long newLimit) { - if (newLimit != null) { - readOptions.put(FlinkReadOptions.LIMIT, Long.toString(newLimit)); - } - return this; - } - - public Builder includeColumnStats(boolean newIncludeColumnStats) { - readOptions.put( - FlinkReadOptions.INCLUDE_COLUMN_STATS, Boolean.toString(newIncludeColumnStats)); - return this; - } - - public Builder planParallelism(int planParallelism) { - readOptions.put( - FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.key(), - Integer.toString(planParallelism)); - return this; - } - - public Builder exposeLocality(boolean newExposeLocality) { - this.exposeLocality = newExposeLocality; - return this; - } - - public Builder maxAllowedPlanningFailures(int maxAllowedPlanningFailures) { - readOptions.put( - FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.key(), - Integer.toString(maxAllowedPlanningFailures)); - return this; - } - - /** - * Set the read properties for Flink source. View the supported properties in {@link - * FlinkReadOptions} - */ - public Builder set(String property, String value) { - readOptions.put(property, value); - return this; - } - - /** - * Set the read properties for Flink source. View the supported properties in {@link - * FlinkReadOptions} - */ - public Builder setAll(Map properties) { - readOptions.putAll(properties); - return this; - } - - /** - * Emits watermarks once per split based on the min value of column statistics from files - * metadata in the given split. The generated watermarks are also used for ordering the splits - * for read. Accepted column types are timestamp/timestamptz/long. For long columns consider - * setting {@link #watermarkColumnTimeUnit(TimeUnit)}. - * - *

    Consider setting `read.split.open-file-cost` to prevent combining small files to a single - * split when the watermark is used for watermark alignment. - */ - public Builder watermarkColumn(String columnName) { - Preconditions.checkArgument( - splitAssignerFactory == null, - "Watermark column and SplitAssigner should not be set in the same source"); - readOptions.put(FlinkReadOptions.WATERMARK_COLUMN, columnName); - return this; - } - - /** - * When the type of the {@link #watermarkColumn} is {@link - * org.apache.iceberg.types.Types.LongType}, then sets the {@link TimeUnit} to convert the - * value. The default value is {@link TimeUnit#MICROSECONDS}. - */ - public Builder watermarkColumnTimeUnit(TimeUnit timeUnit) { - readOptions.put(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT, timeUnit.name()); - return this; - } - - /** - * @deprecated will be removed in 2.0.0; use {@link #setAll} instead. - */ - @Deprecated - public Builder properties(Map properties) { - readOptions.putAll(properties); - return this; - } - - public IcebergSource build() { - if (table == null) { - try (TableLoader loader = tableLoader) { - loader.open(); - this.table = tableLoader.loadTable(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - contextBuilder.resolveConfig(table, readOptions, flinkConfig); - contextBuilder.exposeLocality( - SourceUtil.isLocalityEnabled(table, flinkConfig, exposeLocality)); - contextBuilder.planParallelism( - flinkConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE)); - Schema icebergSchema = table.schema(); - if (projectedFlinkSchema != null) { - contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedFlinkSchema)); - } else if (projectedTableSchema != null) { - contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedTableSchema)); - } - - SerializableRecordEmitter emitter = SerializableRecordEmitter.defaultEmitter(); - FlinkReadConf flinkReadConf = new FlinkReadConf(table, readOptions, flinkConfig); - String watermarkColumn = flinkReadConf.watermarkColumn(); - TimeUnit watermarkTimeUnit = flinkReadConf.watermarkColumnTimeUnit(); - - if (watermarkColumn != null) { - // Column statistics is needed for watermark generation - contextBuilder.includeColumnStats(Sets.newHashSet(watermarkColumn)); - - SplitWatermarkExtractor watermarkExtractor = - new ColumnStatsWatermarkExtractor(icebergSchema, watermarkColumn, watermarkTimeUnit); - emitter = SerializableRecordEmitter.emitterWithWatermark(watermarkExtractor); - splitAssignerFactory = - new OrderedSplitAssignerFactory(SplitComparators.watermark(watermarkExtractor)); - } - - ScanContext context = contextBuilder.build(); - context.validate(); - if (readerFunction == null) { - this.readerFunction = readerFunction(context); - } - - if (splitAssignerFactory == null) { - if (splitComparator == null) { - splitAssignerFactory = new SimpleSplitAssignerFactory(); - } else { - splitAssignerFactory = new OrderedSplitAssignerFactory(splitComparator); - } - } - - // Since builder already load the table, pass it to the source to avoid double loading - return new IcebergSource<>( - tableLoader, - context, - readerFunction, - splitAssignerFactory, - splitComparator, - table, - emitter); - } - - /** - * Build the {@link IcebergSource} and create a {@link DataStream} from the source. Watermark - * strategy is set to {@link WatermarkStrategy#noWatermarks()}. 
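To make the watermark options described above concrete, a hedged sketch of a builder configured for watermark-driven split ordering; the column name event_ts is hypothetical and must exist in the table as a timestamp/timestamptz/long column:

import java.util.concurrent.TimeUnit;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.source.IcebergSource;

class WatermarkColumnSketch {
  // Orders splits and emits per-split watermarks from the min column statistics of "event_ts".
  static IcebergSource<RowData> watermarkedSource(TableLoader tableLoader) {
    return IcebergSource.forRowData()
        .tableLoader(tableLoader)
        .streaming(true)
        .watermarkColumn("event_ts") // hypothetical long column storing epoch milliseconds
        .watermarkColumnTimeUnit(TimeUnit.MILLISECONDS)
        .build();
  }
}

Note that watermarkColumn() and assignerFactory() are mutually exclusive; the builder installs an OrderedSplitAssignerFactory itself when a watermark column is set.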
- * - * @return data stream from the Iceberg source - */ - public DataStream buildStream(StreamExecutionEnvironment env) { - // buildStream should only be called with RowData or Converter paths. - Preconditions.checkState( - readerFunction == null, - "Cannot set reader function when building a data stream from the source"); - IcebergSource source = build(); - TypeInformation outputTypeInfo = - outputTypeInfo(converter, table.schema(), source.scanContext.project()); - DataStreamSource stream = - env.fromSource(source, WatermarkStrategy.noWatermarks(), source.name(), outputTypeInfo); - if (source.shouldInferParallelism()) { - stream = stream.setParallelism(source.inferParallelism(flinkConfig, env)); - } - - return stream; - } - - private static TypeInformation outputTypeInfo( - RowDataConverter converter, Schema tableSchema, Schema projected) { - if (converter != null) { - return converter.getProducedType(); - } else { - // output type is RowData - Schema readSchema = projected != null ? projected : tableSchema; - return (TypeInformation) - FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(readSchema)); - } - } - - private ReaderFunction readerFunction(ScanContext context) { - if (table instanceof BaseMetadataTable) { - MetaDataReaderFunction rowDataReaderFunction = - new MetaDataReaderFunction( - flinkConfig, table.schema(), context.project(), table.io(), table.encryption()); - return (ReaderFunction) rowDataReaderFunction; - } else { - if (converter == null) { - return (ReaderFunction) - new RowDataReaderFunction( - flinkConfig, - table.schema(), - context.project(), - context.nameMapping(), - context.caseSensitive(), - table.io(), - table.encryption(), - context.filters(), - context.limit()); - } else { - return new ConverterReaderFunction<>( - converter, - flinkConfig, - table.schema(), - context.project(), - context.nameMapping(), - context.caseSensitive(), - table.io(), - table.encryption(), - context.filters(), - context.limit()); - } - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java deleted file mode 100644 index e2d131dc3b3d..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
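For comparison with the legacy path, a minimal sketch of the FLIP-27 source built through buildStream(), assuming a placeholder table location and job name:

import java.time.Duration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.source.IcebergSource;
import org.apache.iceberg.flink.source.StreamingStartingStrategy;
import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory;

public class Flip27SourceSketch {
  public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    // Placeholder location; any TableLoader works here.
    TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");

    // Continuous incremental read; buildStream() applies the no-watermarks strategy and
    // infers parallelism only for bounded reads.
    DataStream<RowData> stream =
        IcebergSource.forRowData()
            .tableLoader(tableLoader)
            .assignerFactory(new SimpleSplitAssignerFactory())
            .streaming(true)
            .streamingStartingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT)
            .monitorInterval(Duration.ofSeconds(60))
            .buildStream(env);

    stream.print();
    env.execute("Iceberg FLIP-27 streaming read");
  }
}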
- */ -package org.apache.iceberg.flink.source; - -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; -import org.apache.flink.annotation.Internal; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.ProviderContext; -import org.apache.flink.table.connector.source.DataStreamScanProvider; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.connector.source.ScanTableSource; -import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsSourceWatermark; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.factories.FactoryUtil; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkFilters; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.source.assigner.SplitAssignerType; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.PropertyUtil; - -/** Flink Iceberg table source. 
*/ -@Internal -public class IcebergTableSource - implements ScanTableSource, - SupportsProjectionPushDown, - SupportsFilterPushDown, - SupportsLimitPushDown, - SupportsSourceWatermark { - - private int[] projectedFields; - private Long limit; - private List filters; - - private final TableLoader loader; - private final ResolvedSchema schema; - private final Map properties; - private final boolean isLimitPushDown; - private final ReadableConfig readableConfig; - - private IcebergTableSource(IcebergTableSource toCopy) { - this.loader = toCopy.loader; - this.schema = toCopy.schema; - this.properties = toCopy.properties; - this.projectedFields = toCopy.projectedFields; - this.isLimitPushDown = toCopy.isLimitPushDown; - this.limit = toCopy.limit; - this.filters = toCopy.filters; - this.readableConfig = toCopy.readableConfig; - } - - public IcebergTableSource( - TableLoader loader, - ResolvedSchema schema, - Map properties, - ReadableConfig readableConfig) { - this(loader, schema, properties, null, false, null, ImmutableList.of(), readableConfig); - } - - private IcebergTableSource( - TableLoader loader, - ResolvedSchema schema, - Map properties, - int[] projectedFields, - boolean isLimitPushDown, - Long limit, - List filters, - ReadableConfig readableConfig) { - this.loader = loader; - this.schema = schema; - this.properties = properties; - this.projectedFields = projectedFields; - this.isLimitPushDown = isLimitPushDown; - this.limit = limit; - this.filters = filters; - this.readableConfig = readableConfig; - } - - @Override - public void applyProjection(int[][] projectFields) { - this.projectedFields = new int[projectFields.length]; - for (int i = 0; i < projectFields.length; i++) { - Preconditions.checkArgument( - projectFields[i].length == 1, "Don't support nested projection in iceberg source now."); - this.projectedFields[i] = projectFields[i][0]; - } - } - - private DataStream createDataStream(StreamExecutionEnvironment execEnv) { - return FlinkSource.forRowData() - .env(execEnv) - .tableLoader(loader) - .setAll(properties) - .project(TableSchema.fromResolvedSchema(getProjectedSchema())) - .limit(limit) - .filters(filters) - .flinkConf(readableConfig) - .build(); - } - - private DataStream createFLIP27Stream(StreamExecutionEnvironment env) { - SplitAssignerType assignerType = - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_SPLIT_ASSIGNER_TYPE); - return IcebergSource.forRowData() - .tableLoader(loader) - .assignerFactory(assignerType.factory()) - .setAll(properties) - .project(getProjectedSchema()) - .limit(limit) - .filters(filters) - .flinkConfig(readableConfig) - .buildStream(env); - } - - private ResolvedSchema getProjectedSchema() { - if (projectedFields == null) { - return schema; - } else { - List fullColumns = schema.getColumns(); - return ResolvedSchema.of( - Arrays.stream(projectedFields).mapToObj(fullColumns::get).collect(Collectors.toList())); - } - } - - @Override - public void applyLimit(long newLimit) { - this.limit = newLimit; - } - - @Override - public Result applyFilters(List flinkFilters) { - List acceptedFilters = Lists.newArrayList(); - List expressions = Lists.newArrayList(); - - for (ResolvedExpression resolvedExpression : flinkFilters) { - Optional icebergExpression = FlinkFilters.convert(resolvedExpression); - if (icebergExpression.isPresent()) { - expressions.add(icebergExpression.get()); - acceptedFilters.add(resolvedExpression); - } - } - - this.filters = expressions; - return Result.of(acceptedFilters, flinkFilters); - } - - @Override - public void 
applySourceWatermark() { - Preconditions.checkArgument( - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE), - "Source watermarks are supported only in flip-27 iceberg source implementation"); - - Preconditions.checkNotNull( - properties.get(FlinkReadOptions.WATERMARK_COLUMN), - "watermark-column needs to be configured to use source watermark."); - } - - @Override - public boolean supportsNestedProjection() { - // TODO: support nested projection - return false; - } - - @Override - public ChangelogMode getChangelogMode() { - return ChangelogMode.insertOnly(); - } - - @Override - public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { - return new DataStreamScanProvider() { - @Override - public DataStream produceDataStream( - ProviderContext providerContext, StreamExecutionEnvironment execEnv) { - if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE)) { - return createFLIP27Stream(execEnv); - } else { - return createDataStream(execEnv); - } - } - - @Override - public boolean isBounded() { - return FlinkSource.isBounded(properties); - } - - @Override - public Optional getParallelism() { - return Optional.ofNullable( - PropertyUtil.propertyAsNullableInt(properties, FactoryUtil.SOURCE_PARALLELISM.key())); - } - }; - } - - @Override - public DynamicTableSource copy() { - return new IcebergTableSource(this); - } - - @Override - public String asSummaryString() { - return "Iceberg table source"; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java deleted file mode 100644 index bf6f72cc287a..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
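IcebergTableSource.getScanRuntimeProvider() above switches between the two runtime paths on a config flag. A hedged sketch of flipping that flag from the Table API; the catalog and table names are placeholders and assume an Iceberg catalog has already been registered:

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.iceberg.flink.FlinkConfigOptions;

public class TableSourceToggleSketch {
  public static void main(String[] args) {
    TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());

    // Selects the branch taken in IcebergTableSource.getScanRuntimeProvider():
    // true uses the FLIP-27 IcebergSource, false falls back to the legacy FlinkSource path.
    tEnv.getConfig().set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true);

    // Placeholder catalog and table names.
    tEnv.executeSql("SELECT * FROM my_catalog.db.tbl").print();
  }
}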
- */ -package org.apache.iceberg.flink.source; - -import java.util.List; -import java.util.Map; -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.DeleteFilter; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.FlinkSourceFilter; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.FlinkOrcReader; -import org.apache.iceberg.flink.data.FlinkParquetReaders; -import org.apache.iceberg.flink.data.FlinkPlannedAvroReader; -import org.apache.iceberg.flink.data.RowDataProjection; -import org.apache.iceberg.flink.data.RowDataUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.PartitionUtil; - -@Internal -public class RowDataFileScanTaskReader implements FileScanTaskReader { - - private final Schema tableSchema; - private final Schema projectedSchema; - private final String nameMapping; - private final boolean caseSensitive; - private final FlinkSourceFilter rowFilter; - - public RowDataFileScanTaskReader( - Schema tableSchema, - Schema projectedSchema, - String nameMapping, - boolean caseSensitive, - List filters) { - this.tableSchema = tableSchema; - this.projectedSchema = projectedSchema; - this.nameMapping = nameMapping; - this.caseSensitive = caseSensitive; - - if (filters != null && !filters.isEmpty()) { - Expression combinedExpression = - filters.stream().reduce(Expressions.alwaysTrue(), Expressions::and); - this.rowFilter = - new FlinkSourceFilter(this.projectedSchema, combinedExpression, this.caseSensitive); - } else { - this.rowFilter = null; - } - } - - @Override - public CloseableIterator open( - FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { - Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds()); - - Map idToConstant = - partitionSchema.columns().isEmpty() - ? ImmutableMap.of() - : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); - - FlinkDeleteFilter deletes = - new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); - CloseableIterable iterable = - deletes.filter( - newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor)); - - // Project the RowData to remove the extra meta columns. 
- if (!projectedSchema.sameSchema(deletes.requiredSchema())) { - RowDataProjection rowDataProjection = - RowDataProjection.create( - deletes.requiredRowType(), - deletes.requiredSchema().asStruct(), - projectedSchema.asStruct()); - iterable = CloseableIterable.transform(iterable, rowDataProjection::wrap); - } - - return iterable.iterator(); - } - - private CloseableIterable newIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - CloseableIterable iter; - if (task.isDataTask()) { - throw new UnsupportedOperationException("Cannot read data task."); - } else { - switch (task.file().format()) { - case PARQUET: - iter = newParquetIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - case AVRO: - iter = newAvroIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - case ORC: - iter = newOrcIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - default: - throw new UnsupportedOperationException( - "Cannot read unknown format: " + task.file().format()); - } - } - - if (rowFilter != null) { - return CloseableIterable.filter(iter, rowFilter::filter); - } - return iter; - } - - private CloseableIterable newAvroIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Avro.ReadBuilder builder = - Avro.read(inputFilesDecryptor.getInputFile(task)) - .reuseContainers() - .project(schema) - .split(task.start(), task.length()) - .createReaderFunc(readSchema -> FlinkPlannedAvroReader.create(schema, idToConstant)); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newParquetIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Parquet.ReadBuilder builder = - Parquet.read(inputFilesDecryptor.getInputFile(task)) - .split(task.start(), task.length()) - .project(schema) - .createReaderFunc( - fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .reuseContainers(); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newOrcIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Schema readSchemaWithoutConstantAndMetadataFields = - TypeUtil.selectNot( - schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = - ORC.read(inputFilesDecryptor.getInputFile(task)) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc( - readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private static class FlinkDeleteFilter extends DeleteFilter { - private final RowType requiredRowType; - private final RowDataWrapper asStructLike; - private final InputFilesDecryptor inputFilesDecryptor; - - FlinkDeleteFilter( - FileScanTask task, - Schema tableSchema, - Schema requestedSchema, - InputFilesDecryptor inputFilesDecryptor) { - super(task.file().location(), task.deletes(), tableSchema, requestedSchema); - 
this.requiredRowType = FlinkSchemaUtil.convert(requiredSchema()); - this.asStructLike = new RowDataWrapper(requiredRowType, requiredSchema().asStruct()); - this.inputFilesDecryptor = inputFilesDecryptor; - } - - public RowType requiredRowType() { - return requiredRowType; - } - - @Override - protected StructLike asStructLike(RowData row) { - return asStructLike.wrap(row); - } - - @Override - protected InputFile getInputFile(String location) { - return inputFilesDecryptor.getInputFile(location); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java deleted file mode 100644 index 391633924264..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.api.common.functions.RichMapFunction; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.PropertyUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class RowDataRewriter { - - private static final Logger LOG = LoggerFactory.getLogger(RowDataRewriter.class); - - private final Schema schema; - private final String nameMapping; - private final FileIO io; - private final boolean caseSensitive; - private final EncryptionManager encryptionManager; - private final TaskWriterFactory taskWriterFactory; - private final String tableName; - - public RowDataRewriter( - Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) { - this.schema = table.schema(); - this.caseSensitive = 
caseSensitive; - this.io = io; - this.encryptionManager = encryptionManager; - this.nameMapping = - PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null); - this.tableName = table.name(); - - String formatString = - PropertyUtil.propertyAsString( - table.properties(), - TableProperties.DEFAULT_FILE_FORMAT, - TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); - FileFormat format = FileFormat.fromString(formatString); - RowType flinkSchema = FlinkSchemaUtil.convert(table.schema()); - this.taskWriterFactory = - new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - flinkSchema, - Long.MAX_VALUE, - format, - table.properties(), - null, - false); - } - - public List rewriteDataForTasks( - DataStream dataStream, int parallelism) throws Exception { - RewriteMap map = - new RewriteMap( - schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory); - DataStream> ds = dataStream.map(map).setParallelism(parallelism); - return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream() - .flatMap(Collection::stream) - .collect(Collectors.toList()); - } - - public static class RewriteMap extends RichMapFunction> { - - private TaskWriter writer; - private int subTaskId; - private int attemptId; - - private final FileIO io; - private final EncryptionManager encryptionManager; - private final TaskWriterFactory taskWriterFactory; - private final RowDataFileScanTaskReader rowDataReader; - - public RewriteMap( - Schema schema, - String nameMapping, - FileIO io, - boolean caseSensitive, - EncryptionManager encryptionManager, - TaskWriterFactory taskWriterFactory) { - this.io = io; - this.encryptionManager = encryptionManager; - this.taskWriterFactory = taskWriterFactory; - this.rowDataReader = - new RowDataFileScanTaskReader( - schema, schema, nameMapping, caseSensitive, Collections.emptyList()); - } - - @Override - public void open(Configuration parameters) { - this.subTaskId = getRuntimeContext().getIndexOfThisSubtask(); - this.attemptId = getRuntimeContext().getAttemptNumber(); - // Initialize the task writer factory. - this.taskWriterFactory.initialize(subTaskId, attemptId); - } - - @Override - public List map(CombinedScanTask task) throws Exception { - // Initialize the task writer. 
- this.writer = taskWriterFactory.create(); - try (DataIterator iterator = - new DataIterator<>(rowDataReader, task, io, encryptionManager)) { - while (iterator.hasNext()) { - RowData rowData = iterator.next(); - writer.write(rowData); - } - return Lists.newArrayList(writer.dataFiles()); - } catch (Throwable originalThrowable) { - try { - LOG.error("Aborting commit for (subTaskId {}, attemptId {})", subTaskId, attemptId); - writer.abort(); - LOG.error("Aborted commit for (subTaskId {}, attemptId {})", subTaskId, attemptId); - } catch (Throwable inner) { - if (originalThrowable != inner) { - originalThrowable.addSuppressed(inner); - LOG.warn("Suppressing exception in catch: {}", inner.getMessage(), inner); - } - } - - if (originalThrowable instanceof Exception) { - throw originalThrowable; - } else { - throw new RuntimeException(originalThrowable); - } - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java deleted file mode 100644 index 8ef1f1fbb833..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.Serializable; -import java.util.function.Function; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.flink.annotation.Internal; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.flink.FlinkSchemaUtil; - -/** - * This is not serializable because Avro {@link Schema} is not actually serializable, even though it - * implements {@link Serializable} interface. 
- */ -@Internal -public class RowDataToAvroGenericRecordConverter implements Function { - private final RowDataToAvroConverters.RowDataToAvroConverter converter; - private final Schema avroSchema; - - private RowDataToAvroGenericRecordConverter(RowType rowType, Schema avroSchema) { - this.converter = RowDataToAvroConverters.createConverter(rowType); - this.avroSchema = avroSchema; - } - - @Override - public GenericRecord apply(RowData rowData) { - return (GenericRecord) converter.convert(avroSchema, rowData); - } - - /** Create a converter based on Iceberg schema */ - public static RowDataToAvroGenericRecordConverter fromIcebergSchema( - String tableName, org.apache.iceberg.Schema icebergSchema) { - RowType rowType = FlinkSchemaUtil.convert(icebergSchema); - Schema avroSchema = AvroSchemaUtil.convert(icebergSchema, tableName); - return new RowDataToAvroGenericRecordConverter(rowType, avroSchema); - } - - /** Create a mapper based on Avro schema */ - public static RowDataToAvroGenericRecordConverter fromAvroSchema(Schema avroSchema) { - DataType dataType = AvroSchemaConverter.convertToDataType(avroSchema.toString()); - LogicalType logicalType = TypeConversions.fromDataToLogicalType(dataType); - RowType rowType = RowType.of(logicalType.getChildren().toArray(new LogicalType[0])); - return new RowDataToAvroGenericRecordConverter(rowType, avroSchema); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java deleted file mode 100644 index bac7c05bdfef..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.Serializable; -import java.time.Duration; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; -import org.apache.flink.annotation.Internal; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.util.TimeUtils; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkReadConf; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** Context object with optional arguments for a Flink Scan. 
*/ -@Internal -public class ScanContext implements Serializable { - - private static final long serialVersionUID = 1L; - - private final boolean caseSensitive; - private final boolean exposeLocality; - private final Long snapshotId; - private final String branch; - private final String tag; - private final StreamingStartingStrategy startingStrategy; - private final Long startSnapshotId; - private final Long startSnapshotTimestamp; - private final Long endSnapshotId; - private final Long asOfTimestamp; - private final String startTag; - private final String endTag; - private final Long splitSize; - private final Integer splitLookback; - private final Long splitOpenFileCost; - private final boolean isStreaming; - private final Duration monitorInterval; - - private final String nameMapping; - private final Schema schema; - private final List filters; - private final long limit; - private final boolean includeColumnStats; - private final Collection includeStatsForColumns; - private final Integer planParallelism; - private final int maxPlanningSnapshotCount; - private final int maxAllowedPlanningFailures; - private final String watermarkColumn; - private final TimeUnit watermarkColumnTimeUnit; - - private ScanContext( - boolean caseSensitive, - Long snapshotId, - StreamingStartingStrategy startingStrategy, - Long startSnapshotTimestamp, - Long startSnapshotId, - Long endSnapshotId, - Long asOfTimestamp, - Long splitSize, - Integer splitLookback, - Long splitOpenFileCost, - boolean isStreaming, - Duration monitorInterval, - String nameMapping, - Schema schema, - List filters, - long limit, - boolean includeColumnStats, - Collection includeStatsForColumns, - boolean exposeLocality, - Integer planParallelism, - int maxPlanningSnapshotCount, - int maxAllowedPlanningFailures, - String watermarkColumn, - TimeUnit watermarkColumnTimeUnit, - String branch, - String tag, - String startTag, - String endTag) { - this.caseSensitive = caseSensitive; - this.snapshotId = snapshotId; - this.tag = tag; - this.branch = branch; - this.startingStrategy = startingStrategy; - this.startSnapshotTimestamp = startSnapshotTimestamp; - this.startSnapshotId = startSnapshotId; - this.endSnapshotId = endSnapshotId; - this.asOfTimestamp = asOfTimestamp; - this.startTag = startTag; - this.endTag = endTag; - this.splitSize = splitSize; - this.splitLookback = splitLookback; - this.splitOpenFileCost = splitOpenFileCost; - this.isStreaming = isStreaming; - this.monitorInterval = monitorInterval; - - this.nameMapping = nameMapping; - this.schema = schema; - this.filters = filters; - this.limit = limit; - this.includeColumnStats = includeColumnStats; - this.includeStatsForColumns = includeStatsForColumns; - this.exposeLocality = exposeLocality; - this.planParallelism = planParallelism; - this.maxPlanningSnapshotCount = maxPlanningSnapshotCount; - this.maxAllowedPlanningFailures = maxAllowedPlanningFailures; - this.watermarkColumn = watermarkColumn; - this.watermarkColumnTimeUnit = watermarkColumnTimeUnit; - } - - void validate() { - if (isStreaming) { - if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { - Preconditions.checkArgument( - startSnapshotId != null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); - Preconditions.checkArgument( - startSnapshotTimestamp == null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); - } - if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { - 
Preconditions.checkArgument( - startSnapshotTimestamp != null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); - Preconditions.checkArgument( - startSnapshotId == null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); - } - - Preconditions.checkArgument( - tag == null, - String.format("Cannot scan table using ref %s configured for streaming reader", tag)); - Preconditions.checkArgument( - snapshotId == null, "Cannot set snapshot-id option for streaming reader"); - Preconditions.checkArgument( - asOfTimestamp == null, "Cannot set as-of-timestamp option for streaming reader"); - Preconditions.checkArgument( - endSnapshotId == null, "Cannot set end-snapshot-id option for streaming reader"); - Preconditions.checkArgument(endTag == null, "Cannot set end-tag option for streaming reader"); - } - - Preconditions.checkArgument( - !(startTag != null && startSnapshotId() != null), - "START_SNAPSHOT_ID and START_TAG cannot both be set."); - - Preconditions.checkArgument( - !(endTag != null && endSnapshotId() != null), - "END_SNAPSHOT_ID and END_TAG cannot both be set."); - - Preconditions.checkArgument( - maxAllowedPlanningFailures >= -1, - "Cannot set maxAllowedPlanningFailures to a negative number other than -1."); - } - - public boolean caseSensitive() { - return caseSensitive; - } - - public Long snapshotId() { - return snapshotId; - } - - public String branch() { - return branch; - } - - public String tag() { - return tag; - } - - public String startTag() { - return startTag; - } - - public String endTag() { - return endTag; - } - - public StreamingStartingStrategy streamingStartingStrategy() { - return startingStrategy; - } - - public Long startSnapshotTimestamp() { - return startSnapshotTimestamp; - } - - public Long startSnapshotId() { - return startSnapshotId; - } - - public Long endSnapshotId() { - return endSnapshotId; - } - - public Long asOfTimestamp() { - return asOfTimestamp; - } - - public Long splitSize() { - return splitSize; - } - - public Integer splitLookback() { - return splitLookback; - } - - public Long splitOpenFileCost() { - return splitOpenFileCost; - } - - public boolean isStreaming() { - return isStreaming; - } - - public Duration monitorInterval() { - return monitorInterval; - } - - public String nameMapping() { - return nameMapping; - } - - public Schema project() { - return schema; - } - - public List filters() { - return filters; - } - - public long limit() { - return limit; - } - - public boolean includeColumnStats() { - return includeColumnStats; - } - - public Collection includeStatsForColumns() { - return includeStatsForColumns; - } - - public boolean exposeLocality() { - return exposeLocality; - } - - public Integer planParallelism() { - return planParallelism; - } - - public int maxPlanningSnapshotCount() { - return maxPlanningSnapshotCount; - } - - public int maxAllowedPlanningFailures() { - return maxAllowedPlanningFailures; - } - - public String watermarkColumn() { - return watermarkColumn; - } - - public TimeUnit watermarkColumnTimeUnit() { - return watermarkColumnTimeUnit; - } - - public ScanContext copyWithAppendsBetween(Long newStartSnapshotId, long newEndSnapshotId) { - return ScanContext.builder() - .caseSensitive(caseSensitive) - .useSnapshotId(null) - .useBranch(branch) - .useTag(null) - .startSnapshotId(newStartSnapshotId) - .endSnapshotId(newEndSnapshotId) - .startTag(null) - .endTag(null) - .asOfTimestamp(null) - .splitSize(splitSize) - .splitLookback(splitLookback) - 
.splitOpenFileCost(splitOpenFileCost) - .streaming(isStreaming) - .monitorInterval(monitorInterval) - .nameMapping(nameMapping) - .project(schema) - .filters(filters) - .limit(limit) - .includeColumnStats(includeColumnStats) - .includeColumnStats(includeStatsForColumns) - .exposeLocality(exposeLocality) - .planParallelism(planParallelism) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .maxAllowedPlanningFailures(maxAllowedPlanningFailures) - .watermarkColumn(watermarkColumn) - .watermarkColumnTimeUnit(watermarkColumnTimeUnit) - .build(); - } - - public ScanContext copyWithSnapshotId(long newSnapshotId) { - return ScanContext.builder() - .caseSensitive(caseSensitive) - .useSnapshotId(newSnapshotId) - .useBranch(branch) - .useTag(tag) - .startSnapshotId(null) - .endSnapshotId(null) - .startTag(null) - .endTag(null) - .asOfTimestamp(null) - .splitSize(splitSize) - .splitLookback(splitLookback) - .splitOpenFileCost(splitOpenFileCost) - .streaming(isStreaming) - .monitorInterval(monitorInterval) - .nameMapping(nameMapping) - .project(schema) - .filters(filters) - .limit(limit) - .includeColumnStats(includeColumnStats) - .includeColumnStats(includeStatsForColumns) - .exposeLocality(exposeLocality) - .planParallelism(planParallelism) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .maxAllowedPlanningFailures(maxAllowedPlanningFailures) - .watermarkColumn(watermarkColumn) - .watermarkColumnTimeUnit(watermarkColumnTimeUnit) - .build(); - } - - public static Builder builder() { - return new Builder(); - } - - public static class Builder { - private boolean caseSensitive = FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue(); - private Long snapshotId = FlinkReadOptions.SNAPSHOT_ID.defaultValue(); - private String branch = FlinkReadOptions.BRANCH.defaultValue(); - private String tag = FlinkReadOptions.TAG.defaultValue(); - private String startTag = FlinkReadOptions.START_TAG.defaultValue(); - private String endTag = FlinkReadOptions.END_TAG.defaultValue(); - private StreamingStartingStrategy startingStrategy = - FlinkReadOptions.STARTING_STRATEGY_OPTION.defaultValue(); - private Long startSnapshotTimestamp = FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.defaultValue(); - private Long startSnapshotId = FlinkReadOptions.START_SNAPSHOT_ID.defaultValue(); - private Long endSnapshotId = FlinkReadOptions.END_SNAPSHOT_ID.defaultValue(); - private Long asOfTimestamp = FlinkReadOptions.AS_OF_TIMESTAMP.defaultValue(); - private Long splitSize = FlinkReadOptions.SPLIT_SIZE_OPTION.defaultValue(); - private Integer splitLookback = FlinkReadOptions.SPLIT_LOOKBACK_OPTION.defaultValue(); - private Long splitOpenFileCost = FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION.defaultValue(); - private boolean isStreaming = FlinkReadOptions.STREAMING_OPTION.defaultValue(); - private Duration monitorInterval = - TimeUtils.parseDuration(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue()); - private String nameMapping; - private Schema projectedSchema; - private List filters; - private long limit = FlinkReadOptions.LIMIT_OPTION.defaultValue(); - private boolean includeColumnStats = - FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue(); - private Collection includeStatsForColumns = null; - private boolean exposeLocality; - private Integer planParallelism = - FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue(); - private int maxPlanningSnapshotCount = - FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION.defaultValue(); - private int maxAllowedPlanningFailures = - 
FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue(); - private String watermarkColumn = FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue(); - private TimeUnit watermarkColumnTimeUnit = - FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue(); - - private Builder() {} - - public Builder caseSensitive(boolean newCaseSensitive) { - this.caseSensitive = newCaseSensitive; - return this; - } - - public Builder useSnapshotId(Long newSnapshotId) { - this.snapshotId = newSnapshotId; - return this; - } - - public Builder useTag(String newTag) { - this.tag = newTag; - return this; - } - - public Builder useBranch(String newBranch) { - this.branch = newBranch; - return this; - } - - public Builder startingStrategy(StreamingStartingStrategy newStartingStrategy) { - this.startingStrategy = newStartingStrategy; - return this; - } - - public Builder startSnapshotTimestamp(Long newStartSnapshotTimestamp) { - this.startSnapshotTimestamp = newStartSnapshotTimestamp; - return this; - } - - public Builder startSnapshotId(Long newStartSnapshotId) { - this.startSnapshotId = newStartSnapshotId; - return this; - } - - public Builder endSnapshotId(Long newEndSnapshotId) { - this.endSnapshotId = newEndSnapshotId; - return this; - } - - public Builder startTag(String newStartTag) { - this.startTag = newStartTag; - return this; - } - - public Builder endTag(String newEndTag) { - this.endTag = newEndTag; - return this; - } - - public Builder asOfTimestamp(Long newAsOfTimestamp) { - this.asOfTimestamp = newAsOfTimestamp; - return this; - } - - public Builder splitSize(Long newSplitSize) { - this.splitSize = newSplitSize; - return this; - } - - public Builder splitLookback(Integer newSplitLookback) { - this.splitLookback = newSplitLookback; - return this; - } - - public Builder splitOpenFileCost(Long newSplitOpenFileCost) { - this.splitOpenFileCost = newSplitOpenFileCost; - return this; - } - - public Builder streaming(boolean streaming) { - this.isStreaming = streaming; - return this; - } - - public Builder monitorInterval(Duration newMonitorInterval) { - this.monitorInterval = newMonitorInterval; - return this; - } - - public Builder nameMapping(String newNameMapping) { - this.nameMapping = newNameMapping; - return this; - } - - public Builder project(Schema newProjectedSchema) { - this.projectedSchema = newProjectedSchema; - return this; - } - - public Builder filters(List newFilters) { - this.filters = newFilters; - return this; - } - - public Builder limit(long newLimit) { - this.limit = newLimit; - return this; - } - - public Builder includeColumnStats(boolean newIncludeColumnStats) { - this.includeColumnStats = newIncludeColumnStats; - return this; - } - - public Builder includeColumnStats(Collection newIncludeStatsForColumns) { - this.includeStatsForColumns = newIncludeStatsForColumns; - return this; - } - - public Builder exposeLocality(boolean newExposeLocality) { - this.exposeLocality = newExposeLocality; - return this; - } - - public Builder planParallelism(Integer parallelism) { - this.planParallelism = parallelism; - return this; - } - - public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { - this.maxPlanningSnapshotCount = newMaxPlanningSnapshotCount; - return this; - } - - public Builder maxAllowedPlanningFailures(int newMaxAllowedPlanningFailures) { - this.maxAllowedPlanningFailures = newMaxAllowedPlanningFailures; - return this; - } - - public Builder watermarkColumn(String newWatermarkColumn) { - this.watermarkColumn = newWatermarkColumn; - return 
this; - } - - public Builder watermarkColumnTimeUnit(TimeUnit newWatermarkTimeUnit) { - this.watermarkColumnTimeUnit = newWatermarkTimeUnit; - return this; - } - - public Builder resolveConfig( - Table table, Map readOptions, ReadableConfig readableConfig) { - FlinkReadConf flinkReadConf = new FlinkReadConf(table, readOptions, readableConfig); - - return this.useSnapshotId(flinkReadConf.snapshotId()) - .useTag(flinkReadConf.tag()) - .useBranch(flinkReadConf.branch()) - .startTag(flinkReadConf.startTag()) - .endTag(flinkReadConf.endTag()) - .caseSensitive(flinkReadConf.caseSensitive()) - .asOfTimestamp(flinkReadConf.asOfTimestamp()) - .startingStrategy(flinkReadConf.startingStrategy()) - .startSnapshotTimestamp(flinkReadConf.startSnapshotTimestamp()) - .startSnapshotId(flinkReadConf.startSnapshotId()) - .endSnapshotId(flinkReadConf.endSnapshotId()) - .splitSize(flinkReadConf.splitSize()) - .splitLookback(flinkReadConf.splitLookback()) - .splitOpenFileCost(flinkReadConf.splitFileOpenCost()) - .streaming(flinkReadConf.streaming()) - .monitorInterval(flinkReadConf.monitorInterval()) - .nameMapping(flinkReadConf.nameMapping()) - .limit(flinkReadConf.limit()) - .planParallelism(flinkReadConf.workerPoolSize()) - .includeColumnStats(flinkReadConf.includeColumnStats()) - .maxPlanningSnapshotCount(flinkReadConf.maxPlanningSnapshotCount()) - .maxAllowedPlanningFailures(flinkReadConf.maxAllowedPlanningFailures()) - .watermarkColumn(flinkReadConf.watermarkColumn()) - .watermarkColumnTimeUnit(flinkReadConf.watermarkColumnTimeUnit()); - } - - public ScanContext build() { - return new ScanContext( - caseSensitive, - snapshotId, - startingStrategy, - startSnapshotTimestamp, - startSnapshotId, - endSnapshotId, - asOfTimestamp, - splitSize, - splitLookback, - splitOpenFileCost, - isStreaming, - monitorInterval, - nameMapping, - projectedSchema, - filters, - limit, - includeColumnStats, - includeStatsForColumns, - exposeLocality, - planParallelism, - maxPlanningSnapshotCount, - maxAllowedPlanningFailures, - watermarkColumn, - watermarkColumnTimeUnit, - branch, - tag, - startTag, - endTag); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java deleted file mode 100644 index 7c3a69dbc141..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
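For orientation, a minimal sketch of how the ScanContext.Builder shown above might be used to assemble a streaming scan context; the option values are hypothetical and only setters that appear in this file are used.

import java.time.Duration;
import org.apache.iceberg.flink.source.ScanContext;
import org.apache.iceberg.flink.source.StreamingStartingStrategy;

class ScanContextSketch {
  // Illustrative only: an incremental streaming scan that starts from the
  // latest snapshot and re-plans every 30 seconds.
  static ScanContext streamingContext() {
    return ScanContext.builder()
        .caseSensitive(true)
        .streaming(true)
        .monitorInterval(Duration.ofSeconds(30))
        .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT)
        .splitSize(128L * 1024 * 1024) // target split size in bytes (hypothetical value)
        .maxPlanningSnapshotCount(10)
        .build();
  }
}

In practice, resolveConfig(table, readOptions, readableConfig) would normally populate the builder from table properties and Flink options rather than hard-coded values.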
- */ -package org.apache.iceberg.flink.source; - -import java.util.function.Supplier; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.table.api.config.ExecutionConfigOptions; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.hadoop.Util; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class SourceUtil { - private SourceUtil() {} - - static boolean isLocalityEnabled( - Table table, ReadableConfig readableConfig, Boolean exposeLocality) { - Boolean localityEnabled = - exposeLocality != null - ? exposeLocality - : readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO); - - if (localityEnabled != null && !localityEnabled) { - return false; - } - - return Util.mayHaveBlockLocations(table.io(), table.location()); - } - - /** - * Infer source parallelism. - * - * @param readableConfig Flink config. - * @param splitCountProvider Split count supplier. As the computation may involve expensive split - * discover, lazy evaluation is performed if inferring parallelism is enabled. - * @param limitCount limited output count. - */ - static int inferParallelism( - ReadableConfig readableConfig, long limitCount, Supplier splitCountProvider) { - int parallelism = - readableConfig.get(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM); - if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM)) { - int maxInferParallelism = - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX); - Preconditions.checkState( - maxInferParallelism >= 1, - FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX.key() - + " cannot be less than 1"); - parallelism = Math.min(splitCountProvider.get(), maxInferParallelism); - } - - if (limitCount > 0) { - int limit = limitCount >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) limitCount; - parallelism = Math.min(parallelism, limit); - } - - // parallelism must be positive. - parallelism = Math.max(1, parallelism); - return parallelism; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java deleted file mode 100644 index 39f615aeacc5..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
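As a worked illustration of the clamping order in inferParallelism above, here is a self-contained sketch of the same arithmetic; SourceUtil itself is package-private, so this mirrors the logic rather than calling it, and all numbers are hypothetical.

class InferParallelismSketch {
  // Mirrors the steps above: start from the configured default parallelism,
  // optionally replace it with min(split count, infer-max), cap it by the row
  // limit, and never return less than 1.
  static int inferParallelism(
      int defaultParallelism, boolean inferEnabled, int maxInfer, int splitCount, long limitCount) {
    int parallelism = defaultParallelism;
    if (inferEnabled) {
      parallelism = Math.min(splitCount, maxInfer);
    }
    if (limitCount > 0) {
      int limit = limitCount >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) limitCount;
      parallelism = Math.min(parallelism, limit);
    }
    return Math.max(1, parallelism);
  }

  public static void main(String[] args) {
    // 100 splits, infer-max 64, but a LIMIT 5 query: the result is capped at 5.
    System.out.println(inferParallelism(8, true, 64, 100, 5L)); // prints 5
  }
}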
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.List; -import java.util.concurrent.ExecutorService; -import org.apache.flink.api.common.functions.RuntimeContext; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.typeutils.base.LongSerializer; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.state.FunctionInitializationContext; -import org.apache.flink.runtime.state.FunctionSnapshotContext; -import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; -import org.apache.flink.streaming.api.functions.source.RichSourceFunction; -import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.SnapshotUtil; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This is the single (non-parallel) monitoring task which takes a {@link FlinkInputFormat}, it is - * responsible for: - * - *

<ol>
- *   <li>Monitoring snapshots of the Iceberg table.
- *   <li>Creating the {@link FlinkInputSplit splits} corresponding to the incremental files
- *   <li>Assigning them to downstream tasks for further processing.
- * </ol>
- *
- * <p>

    The splits to be read are forwarded to the downstream {@link StreamingReaderOperator} which - * can have parallelism greater than one. - */ -public class StreamingMonitorFunction extends RichSourceFunction - implements CheckpointedFunction { - - private static final Logger LOG = LoggerFactory.getLogger(StreamingMonitorFunction.class); - - private static final long INIT_LAST_SNAPSHOT_ID = -1L; - - private final TableLoader tableLoader; - private final ScanContext scanContext; - - private volatile boolean isRunning = true; - - // The checkpoint thread is not the same thread that running the function for SourceStreamTask - // now. It's necessary to - // mark this as volatile. - private volatile long lastSnapshotId = INIT_LAST_SNAPSHOT_ID; - - private transient SourceContext sourceContext; - private transient Table table; - private transient ListState lastSnapshotIdState; - private transient ExecutorService workerPool; - - public StreamingMonitorFunction(TableLoader tableLoader, ScanContext scanContext) { - Preconditions.checkArgument( - scanContext.snapshotId() == null, "Cannot set snapshot-id option for streaming reader"); - Preconditions.checkArgument( - scanContext.asOfTimestamp() == null, - "Cannot set as-of-timestamp option for streaming reader"); - Preconditions.checkArgument( - scanContext.endSnapshotId() == null, - "Cannot set end-snapshot-id option for streaming reader"); - Preconditions.checkArgument( - scanContext.endTag() == null, "Cannot set end-tag option for streaming reader"); - Preconditions.checkArgument( - scanContext.maxPlanningSnapshotCount() > 0, - "The max-planning-snapshot-count must be greater than zero"); - this.tableLoader = tableLoader; - this.scanContext = scanContext; - } - - @Override - public void open(Configuration parameters) throws Exception { - super.open(parameters); - - final RuntimeContext runtimeContext = getRuntimeContext(); - ValidationException.check( - runtimeContext instanceof StreamingRuntimeContext, - "context should be instance of StreamingRuntimeContext"); - final String operatorID = ((StreamingRuntimeContext) runtimeContext).getOperatorUniqueID(); - this.workerPool = - ThreadPools.newFixedThreadPool( - "iceberg-worker-pool-" + operatorID, scanContext.planParallelism()); - } - - @Override - public void initializeState(FunctionInitializationContext context) throws Exception { - // Load iceberg table from table loader. - tableLoader.open(); - table = tableLoader.loadTable(); - - // Initialize the flink state for last snapshot id. - lastSnapshotIdState = - context - .getOperatorStateStore() - .getListState(new ListStateDescriptor<>("snapshot-id-state", LongSerializer.INSTANCE)); - - // Restore the last-snapshot-id from flink's state if possible. 
- if (context.isRestored()) { - LOG.info("Restoring state for the {}.", getClass().getSimpleName()); - lastSnapshotId = lastSnapshotIdState.get().iterator().next(); - } else if (scanContext.startTag() != null || scanContext.startSnapshotId() != null) { - Preconditions.checkArgument( - !(scanContext.startTag() != null && scanContext.startSnapshotId() != null), - "START_SNAPSHOT_ID and START_TAG cannot both be set."); - Preconditions.checkNotNull( - table.currentSnapshot(), "Don't have any available snapshot in table."); - - long startSnapshotId; - if (scanContext.startTag() != null) { - Preconditions.checkArgument( - table.snapshot(scanContext.startTag()) != null, - "Cannot find snapshot with tag %s in table.", - scanContext.startTag()); - startSnapshotId = table.snapshot(scanContext.startTag()).snapshotId(); - } else { - startSnapshotId = scanContext.startSnapshotId(); - } - - long currentSnapshotId = table.currentSnapshot().snapshotId(); - Preconditions.checkState( - SnapshotUtil.isAncestorOf(table, currentSnapshotId, startSnapshotId), - "The option start-snapshot-id %s is not an ancestor of the current snapshot.", - startSnapshotId); - - lastSnapshotId = startSnapshotId; - } - } - - @Override - public void snapshotState(FunctionSnapshotContext context) throws Exception { - lastSnapshotIdState.clear(); - lastSnapshotIdState.add(lastSnapshotId); - } - - @Override - public void run(SourceContext ctx) throws Exception { - this.sourceContext = ctx; - while (isRunning) { - monitorAndForwardSplits(); - Thread.sleep(scanContext.monitorInterval().toMillis()); - } - } - - private long toSnapshotIdInclusive( - long lastConsumedSnapshotId, long currentSnapshotId, int maxPlanningSnapshotCount) { - List snapshotIds = - SnapshotUtil.snapshotIdsBetween(table, lastConsumedSnapshotId, currentSnapshotId); - if (snapshotIds.size() <= maxPlanningSnapshotCount) { - return currentSnapshotId; - } else { - // It uses reverted index since snapshotIdsBetween returns Ids that are ordered by committed - // time descending. - return snapshotIds.get(snapshotIds.size() - maxPlanningSnapshotCount); - } - } - - @VisibleForTesting - void sourceContext(SourceContext ctx) { - this.sourceContext = ctx; - } - - @VisibleForTesting - void monitorAndForwardSplits() { - // Refresh the table to get the latest committed snapshot. - table.refresh(); - - Snapshot snapshot = - scanContext.branch() != null - ? 
table.snapshot(scanContext.branch()) - : table.currentSnapshot(); - if (snapshot != null && snapshot.snapshotId() != lastSnapshotId) { - long snapshotId = snapshot.snapshotId(); - - ScanContext newScanContext; - if (lastSnapshotId == INIT_LAST_SNAPSHOT_ID) { - newScanContext = scanContext.copyWithSnapshotId(snapshotId); - } else { - snapshotId = - toSnapshotIdInclusive( - lastSnapshotId, snapshotId, scanContext.maxPlanningSnapshotCount()); - newScanContext = scanContext.copyWithAppendsBetween(lastSnapshotId, snapshotId); - } - - LOG.debug( - "Start discovering splits from {} (exclusive) to {} (inclusive)", - lastSnapshotId, - snapshotId); - long start = System.currentTimeMillis(); - FlinkInputSplit[] splits = - FlinkSplitPlanner.planInputSplits(table, newScanContext, workerPool); - LOG.debug( - "Discovered {} splits, time elapsed {}ms", - splits.length, - System.currentTimeMillis() - start); - - // only need to hold the checkpoint lock when emitting the splits and updating lastSnapshotId - start = System.currentTimeMillis(); - synchronized (sourceContext.getCheckpointLock()) { - for (FlinkInputSplit split : splits) { - sourceContext.collect(split); - } - - lastSnapshotId = snapshotId; - } - LOG.debug( - "Forwarded {} splits, time elapsed {}ms", - splits.length, - System.currentTimeMillis() - start); - } - } - - @Override - public void cancel() { - // this is to cover the case where cancel() is called before the run() - if (sourceContext != null) { - synchronized (sourceContext.getCheckpointLock()) { - isRunning = false; - } - } else { - isRunning = false; - } - - // Release all the resources here. - if (tableLoader != null) { - try { - tableLoader.close(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } - - @Override - public void close() { - cancel(); - - if (workerPool != null) { - workerPool.shutdown(); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java deleted file mode 100644 index ee6f7b63988d..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
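To make the planning throttle in toSnapshotIdInclusive above concrete, a standalone sketch of the same index arithmetic; the descending id list stands in for what SnapshotUtil.snapshotIdsBetween returns, and the ids are hypothetical.

import java.util.List;

class SnapshotRangeSketch {
  // Mirrors the selection above: ids are ordered by commit time descending, so
  // the element at (size - max) is the newest of the `max` oldest unconsumed
  // snapshots, which becomes the inclusive end of this planning round.
  static long toSnapshotIdInclusive(List<Long> idsNewestFirst, long currentSnapshotId, int max) {
    if (idsNewestFirst.size() <= max) {
      return currentSnapshotId;
    }
    return idsNewestFirst.get(idsNewestFirst.size() - max);
  }

  public static void main(String[] args) {
    // Five unconsumed snapshots 5,4,3,2,1 (newest first) and max-planning-snapshot-count = 2:
    // this round plans up to snapshot 2 and leaves 3, 4 and 5 for later rounds.
    System.out.println(toSnapshotIdInclusive(List.of(5L, 4L, 3L, 2L, 1L), 5L, 2)); // prints 2
  }
}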
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.Queue; -import org.apache.flink.api.common.operators.MailboxExecutor; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.runtime.state.JavaSerializer; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.StreamOperator; -import org.apache.flink.streaming.api.operators.StreamOperatorParameters; -import org.apache.flink.streaming.api.operators.StreamSourceContexts; -import org.apache.flink.streaming.api.operators.YieldingOperatorFactory; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * The operator that reads the {@link FlinkInputSplit splits} received from the preceding {@link - * StreamingMonitorFunction}. Contrary to the {@link StreamingMonitorFunction} which has a - * parallelism of 1, this operator can have multiple parallelism. - * - *

    As soon as a split descriptor is received, it is put in a queue, and use {@link - * MailboxExecutor} read the actual data of the split. This architecture allows the separation of - * the reading thread from the one split processing the checkpoint barriers, thus removing any - * potential back-pressure. - */ -public class StreamingReaderOperator extends AbstractStreamOperator - implements OneInputStreamOperator { - - private static final Logger LOG = LoggerFactory.getLogger(StreamingReaderOperator.class); - - // It's the same thread that is running this operator and checkpoint actions. we use this executor - // to schedule only - // one split for future reading, so that a new checkpoint could be triggered without blocking long - // time for exhausting - // all scheduled splits. - private final MailboxExecutor executor; - private FlinkInputFormat format; - - private transient SourceFunction.SourceContext sourceContext; - - private transient ListState inputSplitsState; - private transient Queue splits; - - // Splits are read by the same thread that calls processElement. Each read task is submitted to - // that thread by adding - // them to the executor. This state is used to ensure that only one read task is in that queue at - // a time, so that read - // tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this - // is set to RUNNING. - // When there are no more files to read, this will be set to IDLE. - private transient SplitState currentSplitState; - - private StreamingReaderOperator( - FlinkInputFormat format, ProcessingTimeService timeService, MailboxExecutor mailboxExecutor) { - this.format = Preconditions.checkNotNull(format, "The InputFormat should not be null."); - this.processingTimeService = timeService; - this.executor = - Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - super.initializeState(context); - - // TODO Replace Java serialization with Avro approach to keep state compatibility. - // See issue: https://github.com/apache/iceberg/issues/1698 - inputSplitsState = - context - .getOperatorStateStore() - .getListState(new ListStateDescriptor<>("splits", new JavaSerializer<>())); - - // Initialize the current split state to IDLE. - currentSplitState = SplitState.IDLE; - - // Recover splits state from flink state backend if possible. - splits = Lists.newLinkedList(); - if (context.isRestored()) { - int subtaskIdx = getRuntimeContext().getIndexOfThisSubtask(); - LOG.info("Restoring state for the {} (taskIdx: {}).", getClass().getSimpleName(), subtaskIdx); - - for (FlinkInputSplit split : inputSplitsState.get()) { - splits.add(split); - } - } - - this.sourceContext = - StreamSourceContexts.getSourceContext( - getOperatorConfig().getTimeCharacteristic(), - getProcessingTimeService(), - new Object(), // no actual locking needed - output, - getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(), - -1, - true); - - // Enqueue to process the recovered input splits. 
- enqueueProcessSplits(); - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - super.snapshotState(context); - - inputSplitsState.clear(); - inputSplitsState.addAll(Lists.newArrayList(splits)); - } - - @Override - public void processElement(StreamRecord element) { - splits.add(element.getValue()); - enqueueProcessSplits(); - } - - private void enqueueProcessSplits() { - if (currentSplitState == SplitState.IDLE && !splits.isEmpty()) { - currentSplitState = SplitState.RUNNING; - executor.execute(this::processSplits, this.getClass().getSimpleName()); - } - } - - private void processSplits() throws IOException { - FlinkInputSplit split = splits.poll(); - if (split == null) { - currentSplitState = SplitState.IDLE; - return; - } - - format.open(split); - try { - RowData nextElement = null; - while (!format.reachedEnd()) { - nextElement = format.nextRecord(nextElement); - sourceContext.collect(nextElement); - } - } finally { - currentSplitState = SplitState.IDLE; - format.close(); - } - - // Re-schedule to process the next split. - enqueueProcessSplits(); - } - - @Override - public void processWatermark(Watermark mark) { - // we do nothing because we emit our own watermarks if needed. - } - - @Override - public void close() throws Exception { - super.close(); - - if (format != null) { - format.close(); - format.closeInputFormat(); - format = null; - } - - sourceContext = null; - } - - @Override - public void finish() throws Exception { - super.finish(); - output.close(); - if (sourceContext != null) { - sourceContext.emitWatermark(Watermark.MAX_WATERMARK); - sourceContext.close(); - sourceContext = null; - } - } - - static OneInputStreamOperatorFactory factory(FlinkInputFormat format) { - return new OperatorFactory(format); - } - - private enum SplitState { - IDLE, - RUNNING - } - - private static class OperatorFactory extends AbstractStreamOperatorFactory - implements YieldingOperatorFactory, - OneInputStreamOperatorFactory { - - private final FlinkInputFormat format; - - private transient MailboxExecutor mailboxExecutor; - - private OperatorFactory(FlinkInputFormat format) { - this.format = format; - } - - @Override - public void setMailboxExecutor(MailboxExecutor mailboxExecutor) { - this.mailboxExecutor = mailboxExecutor; - } - - @SuppressWarnings("unchecked") - @Override - public > O createStreamOperator( - StreamOperatorParameters parameters) { - StreamingReaderOperator operator = - new StreamingReaderOperator(format, processingTimeService, mailboxExecutor); - operator.setup( - parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); - return (O) operator; - } - - @Override - public Class getStreamOperatorClass(ClassLoader classLoader) { - return StreamingReaderOperator.class; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java deleted file mode 100644 index fbeaace20934..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
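The IDLE/RUNNING hand-off that StreamingReaderOperator uses above can be seen in isolation in the following sketch; a plain single-threaded executor stands in for Flink's MailboxExecutor and strings stand in for splits, so this is only a model of the scheduling pattern, not of the operator itself.

import java.util.ArrayDeque;
import java.util.Queue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

class SplitQueueSketch {
  private enum SplitState { IDLE, RUNNING }

  private final Queue<String> splits = new ArrayDeque<>();
  private final ExecutorService mailbox = Executors.newSingleThreadExecutor();
  private SplitState state = SplitState.IDLE;

  // At most one read task is queued at a time, so other mailbox actions
  // (e.g. checkpoints) can interleave between splits instead of waiting for
  // the whole backlog to drain.
  synchronized void onSplit(String split) {
    splits.add(split);
    enqueueProcessSplits();
  }

  private synchronized void enqueueProcessSplits() {
    if (state == SplitState.IDLE && !splits.isEmpty()) {
      state = SplitState.RUNNING;
      mailbox.execute(this::processOneSplit);
    }
  }

  private void processOneSplit() {
    String split;
    synchronized (this) {
      split = splits.poll();
    }
    if (split != null) {
      System.out.println("reading " + split); // stand-in for format.open()/nextRecord()
    }
    synchronized (this) {
      state = SplitState.IDLE;
    }
    enqueueProcessSplits(); // reschedule if more splits arrived in the meantime
  }

  public static void main(String[] args) throws InterruptedException {
    SplitQueueSketch sketch = new SplitQueueSketch();
    sketch.onSplit("split-1");
    sketch.onSplit("split-2");
    Thread.sleep(200); // give the mailbox thread time to drain the queue
    sketch.mailbox.shutdown();
  }
}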
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -/** Starting strategy for streaming execution. */ -public enum StreamingStartingStrategy { - /** - * Do a regular table scan then switch to the incremental mode. - * - *

<p>The incremental mode starts from the current snapshot exclusive. - */
- TABLE_SCAN_THEN_INCREMENTAL,
-
- /** - * Start incremental mode from the latest snapshot inclusive.
- * - * <p>If it is an empty table, all future append snapshots should be discovered. - */
- INCREMENTAL_FROM_LATEST_SNAPSHOT,
-
- /** - * Start incremental mode from the latest snapshot exclusive.
- * - * <p>If it is an empty table, all future append snapshots should be discovered. - */
- INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE,
-
- /** - * Start incremental mode from the earliest snapshot inclusive.
- * - * <p>If it is an empty table, all future append snapshots should be discovered. - */
- INCREMENTAL_FROM_EARLIEST_SNAPSHOT,
-
- /** Start incremental mode from a snapshot with a specific id inclusive. */
- INCREMENTAL_FROM_SNAPSHOT_ID,
-
- /** - * Start incremental mode from a snapshot with a specific timestamp inclusive.
- * - * <p>
    If the timestamp is between two snapshots, it should start from the snapshot after the - * timestamp. - */ - INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java deleted file mode 100644 index e7447d08c985..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import java.util.ArrayDeque; -import java.util.Collection; -import java.util.PriorityQueue; -import java.util.Queue; -import java.util.concurrent.CompletableFuture; -import java.util.stream.Collectors; -import javax.annotation.Nullable; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; -import org.apache.iceberg.flink.source.split.SerializableComparator; - -/** - * Since all methods are called in the source coordinator thread by enumerator, there is no need for - * locking. - */ -@Internal -public class DefaultSplitAssigner implements SplitAssigner { - - private final Queue pendingSplits; - private CompletableFuture availableFuture; - - public DefaultSplitAssigner(SerializableComparator comparator) { - this.pendingSplits = comparator == null ? new ArrayDeque<>() : new PriorityQueue<>(comparator); - } - - public DefaultSplitAssigner( - SerializableComparator comparator, - Collection assignerState) { - this(comparator); - // Because default assigner only tracks unassigned splits, - // there is no need to filter splits based on status (unassigned) here. 
- assignerState.forEach(splitState -> pendingSplits.add(splitState.split())); - } - - @Override - public synchronized GetSplitResult getNext(@Nullable String hostname) { - if (pendingSplits.isEmpty()) { - return GetSplitResult.unavailable(); - } else { - IcebergSourceSplit split = pendingSplits.poll(); - return GetSplitResult.forSplit(split); - } - } - - @Override - public void onDiscoveredSplits(Collection splits) { - addSplits(splits); - } - - @Override - public void onUnassignedSplits(Collection splits) { - addSplits(splits); - } - - private synchronized void addSplits(Collection splits) { - if (!splits.isEmpty()) { - pendingSplits.addAll(splits); - // only complete pending future if new splits are discovered - completeAvailableFuturesIfNeeded(); - } - } - - /** Simple assigner only tracks unassigned splits */ - @Override - public synchronized Collection state() { - return pendingSplits.stream() - .map(split -> new IcebergSourceSplitState(split, IcebergSourceSplitStatus.UNASSIGNED)) - .collect(Collectors.toList()); - } - - @Override - public synchronized CompletableFuture isAvailable() { - if (availableFuture == null) { - availableFuture = new CompletableFuture<>(); - } - return availableFuture; - } - - @Override - public synchronized int pendingSplitCount() { - return pendingSplits.size(); - } - - @Override - public long pendingRecords() { - return pendingSplits.stream() - .map(split -> split.task().estimatedRowsCount()) - .reduce(0L, Long::sum); - } - - private synchronized void completeAvailableFuturesIfNeeded() { - if (availableFuture != null && !pendingSplits.isEmpty()) { - availableFuture.complete(null); - } - availableFuture = null; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java deleted file mode 100644 index 36552782b6c1..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -@Internal -public class GetSplitResult { - - public enum Status { - AVAILABLE, - - /** - * There are pending splits. But they can't be assigned due to constraints (like event time - * alignment) - */ - CONSTRAINED, - - /** Assigner doesn't have pending splits. 
*/ - UNAVAILABLE - } - - private final Status status; - private final IcebergSourceSplit split; - - private GetSplitResult(Status status) { - this.status = status; - this.split = null; - } - - private GetSplitResult(IcebergSourceSplit split) { - Preconditions.checkNotNull(split, "Split cannot be null"); - this.status = Status.AVAILABLE; - this.split = split; - } - - public Status status() { - return status; - } - - public IcebergSourceSplit split() { - return split; - } - - private static final GetSplitResult UNAVAILABLE = new GetSplitResult(Status.UNAVAILABLE); - private static final GetSplitResult CONSTRAINED = new GetSplitResult(Status.CONSTRAINED); - - public static GetSplitResult unavailable() { - return UNAVAILABLE; - } - - public static GetSplitResult constrained() { - return CONSTRAINED; - } - - public static GetSplitResult forSplit(IcebergSourceSplit split) { - return new GetSplitResult(split); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java deleted file mode 100644 index e58478897aef..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import java.util.Collection; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -import org.apache.iceberg.flink.source.split.SerializableComparator; - -/** - * Create default assigner with a comparator that hands out splits where the order of the splits - * will be defined by the {@link SerializableComparator}. 
- */ -public class OrderedSplitAssignerFactory implements SplitAssignerFactory { - private final SerializableComparator comparator; - - public OrderedSplitAssignerFactory(SerializableComparator comparator) { - this.comparator = comparator; - } - - @Override - public SplitAssigner createAssigner() { - return new DefaultSplitAssigner(comparator); - } - - @Override - public SplitAssigner createAssigner(Collection assignerState) { - return new DefaultSplitAssigner(comparator, assignerState); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java deleted file mode 100644 index a2e2ff364d46..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import java.util.Collection; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; - -/** Create simple assigner that hands out splits without any guarantee in order or locality. */ -public class SimpleSplitAssignerFactory implements SplitAssignerFactory { - public SimpleSplitAssignerFactory() {} - - @Override - public SplitAssigner createAssigner() { - return new DefaultSplitAssigner(null); - } - - @Override - public SplitAssigner createAssigner(Collection assignerState) { - return new DefaultSplitAssigner(null, assignerState); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java deleted file mode 100644 index dae7c8cca70c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import java.io.Closeable; -import java.util.Collection; -import java.util.concurrent.CompletableFuture; -import javax.annotation.Nullable; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; - -/** - * SplitAssigner interface is extracted out as a separate component so that we can plug in different - * split assignment strategy for different requirements. E.g. - * - *

<ul>
- *   <li>Simple assigner with no ordering guarantee or locality aware optimization.
- *   <li>Locality aware assigner that prefer splits that are local.
- *   <li>Snapshot aware assigner that assign splits based on the order they are committed.
- *   <li>Event time alignment assigner that assign splits satisfying certain time ordering within a
- *       single source or across sources.
- * </ul>
- *
- * <p>
    Assigner implementation needs to be thread safe. Enumerator call the assigner APIs mostly from - * the coordinator thread. But enumerator may call the {@link SplitAssigner#pendingSplitCount()} - * from the I/O threads. - */ -public interface SplitAssigner extends Closeable { - - /** - * Some assigners may need to start background threads or perform other activity such as - * registering as listeners to updates from other event sources e.g., watermark tracker. - */ - default void start() {} - - /** - * Some assigners may need to perform certain actions when their corresponding enumerators are - * closed - */ - @Override - default void close() {} - - /** - * Request a new split from the assigner when enumerator trying to assign splits to awaiting - * readers. - * - *

    If enumerator wasn't able to assign the split (e.g., reader disconnected), enumerator should - * call {@link SplitAssigner#onUnassignedSplits} to return the split. - */ - GetSplitResult getNext(@Nullable String hostname); - - /** Add new splits discovered by enumerator */ - void onDiscoveredSplits(Collection splits); - - /** Forward addSplitsBack event (for failed reader) to assigner */ - void onUnassignedSplits(Collection splits); - - /** - * Some assigner (like event time alignment) may rack in-progress splits to advance watermark upon - * completed splits - */ - default void onCompletedSplits(Collection completedSplitIds) {} - - /** - * Get assigner state for checkpointing. This is a super-set API that works for all currently - * imagined assigners. - */ - Collection state(); - - /** - * Enumerator can get a notification via CompletableFuture when the assigner has more splits - * available later. Enumerator should schedule assignment in the thenAccept action of the future. - * - *

    Assigner will return the same future if this method is called again before the previous - * future is completed. - * - *

    The future can be completed from other thread, e.g. the coordinator thread from another - * thread for event time alignment. - * - *

    If enumerator need to trigger action upon the future completion, it may want to run it in - * the coordinator thread using {@link SplitEnumeratorContext#runInCoordinatorThread(Runnable)}. - */ - CompletableFuture isAvailable(); - - /** - * Return the number of pending splits that haven't been assigned yet. - * - *

    The enumerator can poll this API to publish a metric on the number of pending splits. - * - *

    The enumerator can also use this information to throttle split discovery for streaming read. - * If there are already many pending splits tracked by the assigner, it is undesirable to discover - * more splits and track them in the assigner. That will increase the memory footprint and - * enumerator checkpoint size. - * - *

    Throttling works better together with {@link ScanContext#maxPlanningSnapshotCount()}. - * Otherwise, the next split discovery after throttling will just discover all non-enumerated - * snapshots and splits, which defeats the purpose of throttling. - */ - int pendingSplitCount(); - - /** - * Return the number of pending records, which can act as a measure of the source lag. This value - * could be an estimation if the exact number of records cannot be accurately computed. - */ - long pendingRecords(); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java deleted file mode 100644 index 6e02a556ffcd..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import java.io.Serializable; -import java.util.Collection; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; - -public interface SplitAssignerFactory extends Serializable { - - SplitAssigner createAssigner(); - - SplitAssigner createAssigner(Collection assignerState); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java deleted file mode 100644 index 03ba67a554f9..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
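Aside for reviewers, not part of the patch itself: a minimal sketch of how the SplitAssigner / SplitAssignerFactory contract deleted above could be plugged in. The factory name and ordering rule are invented for illustration, and the sketch assumes the generic signatures SerializableComparator<IcebergSourceSplit> and Collection<IcebergSourceSplitState>, which the flattened diff text no longer shows.

    import java.util.Collection;
    import org.apache.iceberg.flink.source.assigner.DefaultSplitAssigner;
    import org.apache.iceberg.flink.source.assigner.SplitAssigner;
    import org.apache.iceberg.flink.source.assigner.SplitAssignerFactory;
    import org.apache.iceberg.flink.source.split.IcebergSourceSplit;
    import org.apache.iceberg.flink.source.split.IcebergSourceSplitState;
    import org.apache.iceberg.flink.source.split.SerializableComparator;

    // Hypothetical factory (illustration only): orders pending splits by estimated row count,
    // reusing the DefaultSplitAssigner constructors shown in the hunk above.
    public class RowCountOrderedAssignerFactory implements SplitAssignerFactory {
      private static final SerializableComparator<IcebergSourceSplit> BY_ESTIMATED_ROWS =
          (left, right) ->
              Long.compare(left.task().estimatedRowsCount(), right.task().estimatedRowsCount());

      @Override
      public SplitAssigner createAssigner() {
        return new DefaultSplitAssigner(BY_ESTIMATED_ROWS);
      }

      @Override
      public SplitAssigner createAssigner(Collection<IcebergSourceSplitState> assignerState) {
        // Restore unassigned splits from checkpointed state, same as the factories above.
        return new DefaultSplitAssigner(BY_ESTIMATED_ROWS, assignerState);
      }
    }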
- */ -package org.apache.iceberg.flink.source.assigner; - -import org.apache.flink.annotation.Internal; - -@Internal -public enum SplitAssignerType { - SIMPLE { - @Override - public SplitAssignerFactory factory() { - return new SimpleSplitAssignerFactory(); - } - }; - - public abstract SplitAssignerFactory factory(); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java deleted file mode 100644 index fc310606dee9..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.IOException; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.atomic.AtomicReference; -import javax.annotation.Nullable; -import org.apache.flink.api.connector.source.SourceEvent; -import org.apache.flink.api.connector.source.SplitEnumerator; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.api.connector.source.SupportsHandleExecutionAttemptSourceEvent; -import org.apache.iceberg.flink.source.assigner.GetSplitResult; -import org.apache.iceberg.flink.source.assigner.SplitAssigner; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.SplitRequestEvent; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -abstract class AbstractIcebergEnumerator - implements SplitEnumerator, - SupportsHandleExecutionAttemptSourceEvent { - private static final Logger LOG = LoggerFactory.getLogger(AbstractIcebergEnumerator.class); - - private final SplitEnumeratorContext enumeratorContext; - private final SplitAssigner assigner; - private final Map readersAwaitingSplit; - private final AtomicReference> availableFuture; - - AbstractIcebergEnumerator( - SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { - this.enumeratorContext = enumeratorContext; - this.assigner = assigner; - this.readersAwaitingSplit = new LinkedHashMap<>(); - this.availableFuture = new AtomicReference<>(); - this.enumeratorContext - .metricGroup() - // This number may not capture the entire backlog due to split discovery throttling to avoid - // excessive memory footprint. Some pending splits may not have been discovered yet. 
- .setUnassignedSplitsGauge(() -> Long.valueOf(assigner.pendingSplitCount())); - this.enumeratorContext.metricGroup().gauge("pendingRecords", assigner::pendingRecords); - } - - @Override - public void start() { - assigner.start(); - } - - @Override - public void close() throws IOException { - assigner.close(); - } - - @Override - public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { - // Iceberg source uses custom split request event to piggyback finished split ids. - throw new UnsupportedOperationException( - String.format( - Locale.ROOT, - "Received invalid default split request event " - + "from subtask %d as Iceberg source uses custom split request event", - subtaskId)); - } - - @Override - public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) { - if (sourceEvent instanceof SplitRequestEvent) { - SplitRequestEvent splitRequestEvent = (SplitRequestEvent) sourceEvent; - LOG.info("Received request split event from subtask {}", subtaskId); - assigner.onCompletedSplits(splitRequestEvent.finishedSplitIds()); - readersAwaitingSplit.put(subtaskId, splitRequestEvent.requesterHostname()); - assignSplits(); - } else { - throw new IllegalArgumentException( - String.format( - Locale.ROOT, - "Received unknown event from subtask %d: %s", - subtaskId, - sourceEvent.getClass().getCanonicalName())); - } - } - - // Flink's SourceCoordinator already keeps track of subTask to splits mapping. - // It already takes care of re-assigning splits to speculated attempts as well. - @Override - public void handleSourceEvent(int subTaskId, int attemptNumber, SourceEvent sourceEvent) { - handleSourceEvent(subTaskId, sourceEvent); - } - - @Override - public void addSplitsBack(List splits, int subtaskId) { - LOG.info("Add {} splits back to the pool for failed subtask {}", splits.size(), subtaskId); - assigner.onUnassignedSplits(splits); - assignSplits(); - } - - @Override - public void addReader(int subtaskId) { - LOG.info("Added reader: {}", subtaskId); - } - - private void assignSplits() { - LOG.info("Assigning splits for {} awaiting readers", readersAwaitingSplit.size()); - Iterator> awaitingReader = - readersAwaitingSplit.entrySet().iterator(); - while (awaitingReader.hasNext()) { - Map.Entry nextAwaiting = awaitingReader.next(); - // if the reader that requested another split has failed in the meantime, remove - // it from the list of waiting readers - if (!enumeratorContext.registeredReaders().containsKey(nextAwaiting.getKey())) { - awaitingReader.remove(); - continue; - } - - int awaitingSubtask = nextAwaiting.getKey(); - String hostname = nextAwaiting.getValue(); - GetSplitResult getResult = assigner.getNext(hostname); - if (getResult.status() == GetSplitResult.Status.AVAILABLE) { - LOG.info("Assign split to subtask {}: {}", awaitingSubtask, getResult.split()); - enumeratorContext.assignSplit(getResult.split(), awaitingSubtask); - awaitingReader.remove(); - } else if (getResult.status() == GetSplitResult.Status.CONSTRAINED) { - getAvailableFutureIfNeeded(); - break; - } else if (getResult.status() == GetSplitResult.Status.UNAVAILABLE) { - if (shouldWaitForMoreSplits()) { - getAvailableFutureIfNeeded(); - break; - } else { - LOG.info("No more splits available for subtask {}", awaitingSubtask); - enumeratorContext.signalNoMoreSplits(awaitingSubtask); - awaitingReader.remove(); - } - } else { - throw new IllegalArgumentException("Unsupported status: " + getResult.status()); - } - } - } - - /** return true if enumerator should wait for splits like in the continuous 
enumerator case */ - protected abstract boolean shouldWaitForMoreSplits(); - - private synchronized void getAvailableFutureIfNeeded() { - if (availableFuture.get() != null) { - return; - } - - CompletableFuture future = - assigner - .isAvailable() - .thenAccept( - ignore -> - // Must run assignSplits in coordinator thread - // because the future may be completed from other threads. - // E.g., in event time alignment assigner, - // watermark advancement from another source may - // cause the available future to be completed - enumeratorContext.runInCoordinatorThread( - () -> { - LOG.debug("Executing callback of assignSplits"); - availableFuture.set(null); - assignSplits(); - })); - availableFuture.set(future); - LOG.debug("Registered callback for future available splits"); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java deleted file mode 100644 index 41863ffee60b..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import java.util.Collection; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class ContinuousEnumerationResult { - private final Collection splits; - private final IcebergEnumeratorPosition fromPosition; - private final IcebergEnumeratorPosition toPosition; - - /** - * @param splits should never be null. But it can be an empty collection - * @param fromPosition can be null - * @param toPosition should never be null. 
But it can have null snapshotId and snapshotTimestampMs - */ - ContinuousEnumerationResult( - Collection splits, - IcebergEnumeratorPosition fromPosition, - IcebergEnumeratorPosition toPosition) { - Preconditions.checkArgument(splits != null, "Invalid to splits collection: null"); - Preconditions.checkArgument(toPosition != null, "Invalid end position: null"); - this.splits = splits; - this.fromPosition = fromPosition; - this.toPosition = toPosition; - } - - public Collection splits() { - return splits; - } - - public IcebergEnumeratorPosition fromPosition() { - return fromPosition; - } - - public IcebergEnumeratorPosition toPosition() { - return toPosition; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java deleted file mode 100644 index c50c3854ee14..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.IOException; -import java.util.Collections; -import java.util.Objects; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicReference; -import javax.annotation.Nullable; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.assigner.SplitAssigner; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.util.ElapsedTimeGauge; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@Internal -public class ContinuousIcebergEnumerator extends AbstractIcebergEnumerator { - - private static final Logger LOG = LoggerFactory.getLogger(ContinuousIcebergEnumerator.class); - - /** - * This is hardcoded, as {@link ScanContext#maxPlanningSnapshotCount()} could be the knob to - * control the total number of snapshots worth of splits tracked by assigner. - */ - private static final int ENUMERATION_SPLIT_COUNT_HISTORY_SIZE = 3; - - private final SplitEnumeratorContext enumeratorContext; - private final SplitAssigner assigner; - private final ScanContext scanContext; - private final ContinuousSplitPlanner splitPlanner; - - /** - * snapshotId for the last enumerated snapshot. next incremental enumeration should be based off - * this as the starting position. - */ - private final AtomicReference enumeratorPosition; - - /** Track enumeration result history for split discovery throttling. 
*/ - private final EnumerationHistory enumerationHistory; - - /** Count the consecutive failures and throw exception if the max allowed failres are reached */ - private transient int consecutiveFailures = 0; - - private final ElapsedTimeGauge elapsedSecondsSinceLastSplitDiscovery; - - public ContinuousIcebergEnumerator( - SplitEnumeratorContext enumeratorContext, - SplitAssigner assigner, - ScanContext scanContext, - ContinuousSplitPlanner splitPlanner, - @Nullable IcebergEnumeratorState enumState) { - super(enumeratorContext, assigner); - - this.enumeratorContext = enumeratorContext; - this.assigner = assigner; - this.scanContext = scanContext; - this.splitPlanner = splitPlanner; - this.enumeratorPosition = new AtomicReference<>(); - this.enumerationHistory = new EnumerationHistory(ENUMERATION_SPLIT_COUNT_HISTORY_SIZE); - this.elapsedSecondsSinceLastSplitDiscovery = new ElapsedTimeGauge(TimeUnit.SECONDS); - this.enumeratorContext - .metricGroup() - .gauge("elapsedSecondsSinceLastSplitDiscovery", elapsedSecondsSinceLastSplitDiscovery); - - if (enumState != null) { - this.enumeratorPosition.set(enumState.lastEnumeratedPosition()); - this.enumerationHistory.restore(enumState.enumerationSplitCountHistory()); - } - } - - @Override - public void start() { - super.start(); - enumeratorContext.callAsync( - this::discoverSplits, - this::processDiscoveredSplits, - 0L, - scanContext.monitorInterval().toMillis()); - } - - @Override - public void close() throws IOException { - splitPlanner.close(); - super.close(); - } - - @Override - protected boolean shouldWaitForMoreSplits() { - return true; - } - - @Override - public IcebergEnumeratorState snapshotState(long checkpointId) { - return new IcebergEnumeratorState( - enumeratorPosition.get(), assigner.state(), enumerationHistory.snapshot()); - } - - /** This method is executed in an IO thread pool. */ - private ContinuousEnumerationResult discoverSplits() { - int pendingSplitCountFromAssigner = assigner.pendingSplitCount(); - if (enumerationHistory.shouldPauseSplitDiscovery(pendingSplitCountFromAssigner)) { - // If the assigner already has many pending splits, it is better to pause split discovery. - // Otherwise, eagerly discovering more splits will just increase assigner memory footprint - // and enumerator checkpoint state size. - LOG.info( - "Pause split discovery as the assigner already has too many pending splits: {}", - pendingSplitCountFromAssigner); - return new ContinuousEnumerationResult( - Collections.emptyList(), enumeratorPosition.get(), enumeratorPosition.get()); - } else { - return splitPlanner.planSplits(enumeratorPosition.get()); - } - } - - /** This method is executed in a single coordinator thread. */ - private void processDiscoveredSplits(ContinuousEnumerationResult result, Throwable error) { - if (error == null) { - consecutiveFailures = 0; - if (!Objects.equals(result.fromPosition(), enumeratorPosition.get())) { - // Multiple discoverSplits() may be triggered with the same starting snapshot to the I/O - // thread pool. E.g., the splitDiscoveryInterval is very short (like 10 ms in some unit - // tests) or the thread pool is busy and multiple discovery actions are executed - // concurrently. Discovery result should only be accepted if the starting position - // matches the enumerator position (like compare-and-swap). 
- LOG.info( - "Skip {} discovered splits because the scan starting position doesn't match " - + "the current enumerator position: enumerator position = {}, scan starting position = {}", - result.splits().size(), - enumeratorPosition.get(), - result.fromPosition()); - } else { - elapsedSecondsSinceLastSplitDiscovery.refreshLastRecordedTime(); - // Sometimes, enumeration may yield no splits for a few reasons. - // - upstream paused or delayed streaming writes to the Iceberg table. - // - enumeration frequency is higher than the upstream write frequency. - if (!result.splits().isEmpty()) { - assigner.onDiscoveredSplits(result.splits()); - // EnumerationHistory makes throttling decision on split discovery - // based on the total number of splits discovered in the last a few cycles. - // Only update enumeration history when there are some discovered splits. - enumerationHistory.add(result.splits().size()); - LOG.info( - "Added {} splits discovered between ({}, {}] to the assigner", - result.splits().size(), - result.fromPosition(), - result.toPosition()); - } else { - LOG.info( - "No new splits discovered between ({}, {}]", - result.fromPosition(), - result.toPosition()); - } - // update the enumerator position even if there is no split discovered - // or the toPosition is empty (e.g. for empty table). - enumeratorPosition.set(result.toPosition()); - LOG.info("Update enumerator position to {}", result.toPosition()); - } - } else { - consecutiveFailures++; - if (scanContext.maxAllowedPlanningFailures() < 0 - || consecutiveFailures <= scanContext.maxAllowedPlanningFailures()) { - LOG.error("Failed to discover new splits", error); - } else { - throw new RuntimeException("Failed to discover new splits", error); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java deleted file mode 100644 index 2a1325178873..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
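Aside for reviewers, not part of the patch itself: the consecutive-failure rule in ContinuousIcebergEnumerator#processDiscoveredSplits above, isolated as a plain predicate. Class and method names are invented for illustration; a negative maxAllowedPlanningFailures means planning failures are tolerated indefinitely.

    // Illustrative sketch only; mirrors the consecutiveFailures check deleted above.
    final class PlanningFailurePolicy {
      private PlanningFailurePolicy() {}

      // Returns true while the enumerator should log the error and keep retrying discovery.
      static boolean tolerate(int consecutiveFailures, int maxAllowedPlanningFailures) {
        return maxAllowedPlanningFailures < 0
            || consecutiveFailures <= maxAllowedPlanningFailures;
      }

      public static void main(String[] args) {
        // With max = 3: the 3rd consecutive failure is still tolerated, the 4th is fatal.
        System.out.println(tolerate(3, 3));   // true
        System.out.println(tolerate(4, 3));   // false
        System.out.println(tolerate(100, -1)); // true (unlimited retries)
      }
    }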
- */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.Closeable; -import org.apache.flink.annotation.Internal; - -/** This interface is introduced so that we can plug in different split planner for unit test */ -@Internal -public interface ContinuousSplitPlanner extends Closeable { - - /** Discover the files appended between {@code lastPosition} and current table snapshot */ - ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java deleted file mode 100644 index e8478b8ea89d..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.ExecutorService; -import org.apache.flink.annotation.Internal; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.source.FlinkSplitPlanner; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.SnapshotUtil; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@Internal -public class ContinuousSplitPlannerImpl implements ContinuousSplitPlanner { - private static final Logger LOG = LoggerFactory.getLogger(ContinuousSplitPlannerImpl.class); - - private final Table table; - private final ScanContext scanContext; - private final boolean isSharedPool; - private final ExecutorService workerPool; - private final TableLoader tableLoader; - - /** - * @param tableLoader A cloned tableLoader. - * @param threadName thread name prefix for worker pool to run the split planning. If null, a - * shared worker pool will be used. 
- */ - public ContinuousSplitPlannerImpl( - TableLoader tableLoader, ScanContext scanContext, String threadName) { - this.tableLoader = tableLoader.clone(); - this.tableLoader.open(); - this.table = this.tableLoader.loadTable(); - this.scanContext = scanContext; - this.isSharedPool = threadName == null; - this.workerPool = - isSharedPool - ? ThreadPools.getWorkerPool() - : ThreadPools.newFixedThreadPool( - "iceberg-plan-worker-pool-" + threadName, scanContext.planParallelism()); - } - - @Override - public void close() throws IOException { - if (!isSharedPool) { - workerPool.shutdown(); - } - tableLoader.close(); - } - - @Override - public ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition) { - table.refresh(); - if (lastPosition != null) { - return discoverIncrementalSplits(lastPosition); - } else { - return discoverInitialSplits(); - } - } - - private Snapshot toSnapshotInclusive( - Long lastConsumedSnapshotId, Snapshot currentSnapshot, int maxPlanningSnapshotCount) { - // snapshots are in reverse order (latest snapshot first) - List snapshots = - Lists.newArrayList( - SnapshotUtil.ancestorsBetween( - table, currentSnapshot.snapshotId(), lastConsumedSnapshotId)); - if (snapshots.size() <= maxPlanningSnapshotCount) { - return currentSnapshot; - } else { - // Because snapshots are in reverse order of commit history, this index returns - // the max allowed number of snapshots from the lastConsumedSnapshotId. - return snapshots.get(snapshots.size() - maxPlanningSnapshotCount); - } - } - - private ContinuousEnumerationResult discoverIncrementalSplits( - IcebergEnumeratorPosition lastPosition) { - Snapshot currentSnapshot = - scanContext.branch() != null - ? table.snapshot(scanContext.branch()) - : table.currentSnapshot(); - - if (currentSnapshot == null) { - // empty table - Preconditions.checkArgument( - lastPosition.snapshotId() == null, - "Invalid last enumerated position for an empty table: not null"); - LOG.info("Skip incremental scan because table is empty"); - return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); - } else if (lastPosition.snapshotId() != null - && currentSnapshot.snapshotId() == lastPosition.snapshotId()) { - LOG.info("Current table snapshot is already enumerated: {}", currentSnapshot.snapshotId()); - return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); - } else { - Long lastConsumedSnapshotId = lastPosition.snapshotId(); - Snapshot toSnapshotInclusive = - toSnapshotInclusive( - lastConsumedSnapshotId, currentSnapshot, scanContext.maxPlanningSnapshotCount()); - IcebergEnumeratorPosition newPosition = - IcebergEnumeratorPosition.of( - toSnapshotInclusive.snapshotId(), toSnapshotInclusive.timestampMillis()); - ScanContext incrementalScan = - scanContext.copyWithAppendsBetween( - lastPosition.snapshotId(), toSnapshotInclusive.snapshotId()); - List splits = - FlinkSplitPlanner.planIcebergSourceSplits(table, incrementalScan, workerPool); - LOG.info( - "Discovered {} splits from incremental scan: " - + "from snapshot (exclusive) is {}, to snapshot (inclusive) is {}", - splits.size(), - lastPosition, - newPosition); - return new ContinuousEnumerationResult(splits, lastPosition, newPosition); - } - } - - /** - * Discovery initial set of splits based on {@link StreamingStartingStrategy}. - *

  • {@link ContinuousEnumerationResult#splits()} should contain initial splits discovered from - * table scan for {@link StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}. For all other - * strategies, splits collection should be empty. - *
  • {@link ContinuousEnumerationResult#toPosition()} points to the starting position for the - * next incremental split discovery with exclusive behavior. Meaning files committed by the - * snapshot from the position in {@code ContinuousEnumerationResult} won't be included in the - * next incremental scan. - */ - private ContinuousEnumerationResult discoverInitialSplits() { - Optional startSnapshotOptional = startSnapshot(table, scanContext); - if (!startSnapshotOptional.isPresent()) { - return new ContinuousEnumerationResult( - Collections.emptyList(), null, IcebergEnumeratorPosition.empty()); - } - - Snapshot startSnapshot = startSnapshotOptional.get(); - LOG.info( - "Get starting snapshot id {} based on strategy {}", - startSnapshot.snapshotId(), - scanContext.streamingStartingStrategy()); - List splits = Collections.emptyList(); - IcebergEnumeratorPosition toPosition; - if (scanContext.streamingStartingStrategy() - == StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) { - // do a batch table scan first - splits = - FlinkSplitPlanner.planIcebergSourceSplits( - table, scanContext.copyWithSnapshotId(startSnapshot.snapshotId()), workerPool); - LOG.info( - "Discovered {} splits from initial batch table scan with snapshot Id {}", - splits.size(), - startSnapshot.snapshotId()); - // For TABLE_SCAN_THEN_INCREMENTAL, incremental mode starts exclusive from the startSnapshot - toPosition = - IcebergEnumeratorPosition.of(startSnapshot.snapshotId(), startSnapshot.timestampMillis()); - } else if (scanContext.streamingStartingStrategy() - == StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE) { - toPosition = - IcebergEnumeratorPosition.of(startSnapshot.snapshotId(), startSnapshot.timestampMillis()); - LOG.info( - "Start incremental scan with start snapshot (exclusive): id = {}, timestamp = {}", - startSnapshot.snapshotId(), - startSnapshot.timestampMillis()); - } else { - // For all other modes, starting snapshot should be consumed inclusively. - // Use parentId to achieve the inclusive behavior. It is fine if parentId is null. - Long parentSnapshotId = startSnapshot.parentId(); - if (parentSnapshotId != null) { - Snapshot parentSnapshot = table.snapshot(parentSnapshotId); - Long parentSnapshotTimestampMs = - parentSnapshot != null ? parentSnapshot.timestampMillis() : null; - toPosition = IcebergEnumeratorPosition.of(parentSnapshotId, parentSnapshotTimestampMs); - } else { - toPosition = IcebergEnumeratorPosition.empty(); - } - - LOG.info( - "Start incremental scan with start snapshot (inclusive): id = {}, timestamp = {}", - startSnapshot.snapshotId(), - startSnapshot.timestampMillis()); - } - - return new ContinuousEnumerationResult(splits, null, toPosition); - } - - /** - * Calculate the starting snapshot based on the {@link StreamingStartingStrategy} defined in - * {@code ScanContext}. - * - *

    If the {@link StreamingStartingStrategy} is not {@link - * StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}, the start snapshot should be consumed - * inclusively. - */ - @VisibleForTesting - static Optional startSnapshot(Table table, ScanContext scanContext) { - switch (scanContext.streamingStartingStrategy()) { - case TABLE_SCAN_THEN_INCREMENTAL: - case INCREMENTAL_FROM_LATEST_SNAPSHOT: - case INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE: - return Optional.ofNullable(table.currentSnapshot()); - case INCREMENTAL_FROM_EARLIEST_SNAPSHOT: - return Optional.ofNullable(SnapshotUtil.oldestAncestor(table)); - case INCREMENTAL_FROM_SNAPSHOT_ID: - Snapshot matchedSnapshotById = table.snapshot(scanContext.startSnapshotId()); - Preconditions.checkArgument( - matchedSnapshotById != null, - "Start snapshot id not found in history: " + scanContext.startSnapshotId()); - return Optional.of(matchedSnapshotById); - case INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP: - Snapshot matchedSnapshotByTimestamp = - SnapshotUtil.oldestAncestorAfter(table, scanContext.startSnapshotTimestamp()); - Preconditions.checkArgument( - matchedSnapshotByTimestamp != null, - "Cannot find a snapshot after: " + scanContext.startSnapshotTimestamp()); - return Optional.of(matchedSnapshotByTimestamp); - default: - throw new IllegalArgumentException( - "Unknown starting strategy: " + scanContext.streamingStartingStrategy()); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java deleted file mode 100644 index ec56a9ecdac1..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import java.util.Arrays; -import javax.annotation.concurrent.ThreadSafe; -import org.apache.flink.annotation.VisibleForTesting; - -/** - * This enumeration history is used for split discovery throttling. It tracks the discovered split - * count per every non-empty enumeration. 
- */ -@ThreadSafe -class EnumerationHistory { - - private final int[] history; - // int (2B) should be enough without overflow for enumeration history - private int count; - - EnumerationHistory(int maxHistorySize) { - this.history = new int[maxHistorySize]; - } - - synchronized void restore(int[] restoredHistory) { - int startingOffset = 0; - int restoreSize = restoredHistory.length; - - if (restoredHistory.length > history.length) { - // keep the newest history - startingOffset = restoredHistory.length - history.length; - // only restore the latest history up to maxHistorySize - restoreSize = history.length; - } - - System.arraycopy(restoredHistory, startingOffset, history, 0, restoreSize); - count = restoreSize; - } - - synchronized int[] snapshot() { - int len = history.length; - if (count > len) { - int[] copy = new int[len]; - // this is like a circular buffer - int indexForOldest = count % len; - System.arraycopy(history, indexForOldest, copy, 0, len - indexForOldest); - System.arraycopy(history, 0, copy, len - indexForOldest, indexForOldest); - return copy; - } else { - return Arrays.copyOfRange(history, 0, count); - } - } - - /** Add the split count from the last enumeration result. */ - synchronized void add(int splitCount) { - int pos = count % history.length; - history[pos] = splitCount; - count += 1; - } - - @VisibleForTesting - synchronized boolean hasFullHistory() { - return count >= history.length; - } - - /** - * Checks whether split discovery should be paused. - * - * @return true if split discovery should pause because assigner has too many splits already. - */ - synchronized boolean shouldPauseSplitDiscovery(int pendingSplitCountFromAssigner) { - if (count < history.length) { - // only check throttling when full history is obtained. - return false; - } else { - // if ScanContext#maxPlanningSnapshotCount() is 10, each split enumeration can - // discovery splits up to 10 snapshots. if maxHistorySize is 3, the max number of - // splits tracked in assigner shouldn't be more than 10 * (3 + 1) snapshots - // worth of splits. +1 because there could be another enumeration when the - // pending splits fall just below the 10 * 3. - int totalSplitCountFromRecentDiscovery = Arrays.stream(history).reduce(0, Integer::sum); - return pendingSplitCountFromAssigner >= totalSplitCountFromRecentDiscovery; - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java deleted file mode 100644 index 96aba296f8cf..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
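Aside for reviewers, not part of the patch itself: a worked example of the pause rule in EnumerationHistory#shouldPauseSplitDiscovery above. The class below is invented for illustration; it only restates the arithmetic from the deleted comment (with maxPlanningSnapshotCount = 10 and a history window of 3, the assigner holds at most roughly 10 * (3 + 1) snapshots worth of splits).

    // Illustrative sketch only: pause discovery once the history window is full and the
    // assigner already holds at least as many pending splits as the recent discoveries produced.
    final class ThrottlingExample {
      private ThrottlingExample() {}

      static boolean shouldPause(int[] history, int enumerationCount, int pendingSplits) {
        if (enumerationCount < history.length) {
          return false; // not enough history yet, never throttle
        }
        int recentTotal = java.util.Arrays.stream(history).sum();
        return pendingSplits >= recentTotal;
      }

      public static void main(String[] args) {
        // Window of 3 non-empty enumerations that discovered 40, 35 and 45 splits.
        int[] history = {40, 35, 45};
        System.out.println(shouldPause(history, 3, 100)); // false: 100 < 120
        System.out.println(shouldPause(history, 3, 120)); // true: 120 >= 120, pause discovery
      }
    }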
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; - -class IcebergEnumeratorPosition { - private final Long snapshotId; - // Track snapshot timestamp mainly for info logging - private final Long snapshotTimestampMs; - - static IcebergEnumeratorPosition empty() { - return new IcebergEnumeratorPosition(null, null); - } - - static IcebergEnumeratorPosition of(long snapshotId, Long snapshotTimestampMs) { - return new IcebergEnumeratorPosition(snapshotId, snapshotTimestampMs); - } - - private IcebergEnumeratorPosition(Long snapshotId, Long snapshotTimestampMs) { - this.snapshotId = snapshotId; - this.snapshotTimestampMs = snapshotTimestampMs; - } - - boolean isEmpty() { - return snapshotId == null; - } - - Long snapshotId() { - return snapshotId; - } - - Long snapshotTimestampMs() { - return snapshotTimestampMs; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("snapshotId", snapshotId) - .add("snapshotTimestampMs", snapshotTimestampMs) - .toString(); - } - - @Override - public int hashCode() { - return Objects.hashCode(snapshotId, snapshotTimestampMs); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - IcebergEnumeratorPosition other = (IcebergEnumeratorPosition) o; - return Objects.equal(snapshotId, other.snapshotId()) - && Objects.equal(snapshotTimestampMs, other.snapshotTimestampMs()); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java deleted file mode 100644 index 1c63807361c5..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.IOException; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; - -class IcebergEnumeratorPositionSerializer - implements SimpleVersionedSerializer { - - public static final IcebergEnumeratorPositionSerializer INSTANCE = - new IcebergEnumeratorPositionSerializer(); - - private static final int VERSION = 1; - - private static final ThreadLocal SERIALIZER_CACHE = - ThreadLocal.withInitial(() -> new DataOutputSerializer(128)); - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(IcebergEnumeratorPosition position) throws IOException { - return serializeV1(position); - } - - @Override - public IcebergEnumeratorPosition deserialize(int version, byte[] serialized) throws IOException { - switch (version) { - case 1: - return deserializeV1(serialized); - default: - throw new IOException("Unknown version: " + version); - } - } - - private byte[] serializeV1(IcebergEnumeratorPosition position) throws IOException { - DataOutputSerializer out = SERIALIZER_CACHE.get(); - out.writeBoolean(position.snapshotId() != null); - if (position.snapshotId() != null) { - out.writeLong(position.snapshotId()); - } - out.writeBoolean(position.snapshotTimestampMs() != null); - if (position.snapshotTimestampMs() != null) { - out.writeLong(position.snapshotTimestampMs()); - } - byte[] result = out.getCopyOfBuffer(); - out.clear(); - return result; - } - - private IcebergEnumeratorPosition deserializeV1(byte[] serialized) throws IOException { - DataInputDeserializer in = new DataInputDeserializer(serialized); - Long snapshotId = null; - if (in.readBoolean()) { - snapshotId = in.readLong(); - } - - Long snapshotTimestampMs = null; - if (in.readBoolean()) { - snapshotTimestampMs = in.readLong(); - } - - if (snapshotId != null) { - return IcebergEnumeratorPosition.of(snapshotId, snapshotTimestampMs); - } else { - return IcebergEnumeratorPosition.empty(); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java deleted file mode 100644 index 26fbad46c128..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
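Aside for reviewers, not part of the patch itself: the V1 wire layout used by IcebergEnumeratorPositionSerializer above, reproduced with plain java.io streams so the encoding is easy to see. Each nullable long is written as a presence flag followed by the value when present; the class name is invented for illustration.

    import java.io.ByteArrayOutputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    // Illustrative sketch only; mirrors serializeV1 in the hunk above.
    final class PositionWireFormatSketch {
      private PositionWireFormatSketch() {}

      static byte[] encode(Long snapshotId, Long snapshotTimestampMs) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        out.writeBoolean(snapshotId != null);          // presence flag for snapshotId
        if (snapshotId != null) {
          out.writeLong(snapshotId);
        }
        out.writeBoolean(snapshotTimestampMs != null); // presence flag for timestamp
        if (snapshotTimestampMs != null) {
          out.writeLong(snapshotTimestampMs);
        }
        out.flush();
        return bytes.toByteArray();
      }

      public static void main(String[] args) throws IOException {
        // 1 + 8 + 1 + 8 = 18 bytes for a populated position; 2 bytes for an empty one.
        System.out.println(encode(123L, 456L).length); // 18
        System.out.println(encode(null, null).length); // 2
      }
    }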
- */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.Serializable; -import java.util.Collection; -import javax.annotation.Nullable; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; - -/** Enumerator state for checkpointing */ -@Internal -public class IcebergEnumeratorState implements Serializable { - @Nullable private final IcebergEnumeratorPosition lastEnumeratedPosition; - private final Collection pendingSplits; - private final int[] enumerationSplitCountHistory; - - public IcebergEnumeratorState(Collection pendingSplits) { - this(null, pendingSplits); - } - - public IcebergEnumeratorState( - @Nullable IcebergEnumeratorPosition lastEnumeratedPosition, - Collection pendingSplits) { - this(lastEnumeratedPosition, pendingSplits, new int[0]); - } - - public IcebergEnumeratorState( - @Nullable IcebergEnumeratorPosition lastEnumeratedPosition, - Collection pendingSplits, - int[] enumerationSplitCountHistory) { - this.lastEnumeratedPosition = lastEnumeratedPosition; - this.pendingSplits = pendingSplits; - this.enumerationSplitCountHistory = enumerationSplitCountHistory; - } - - @Nullable - public IcebergEnumeratorPosition lastEnumeratedPosition() { - return lastEnumeratedPosition; - } - - public Collection pendingSplits() { - return pendingSplits; - } - - public int[] enumerationSplitCountHistory() { - return enumerationSplitCountHistory; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java deleted file mode 100644 index f76f8a69ff0e..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.IOException; -import java.util.Collection; -import org.apache.flink.annotation.Internal; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -@Internal -public class IcebergEnumeratorStateSerializer - implements SimpleVersionedSerializer { - - private static final int VERSION = 2; - - private static final ThreadLocal SERIALIZER_CACHE = - ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); - - private final IcebergEnumeratorPositionSerializer positionSerializer = - IcebergEnumeratorPositionSerializer.INSTANCE; - private final IcebergSourceSplitSerializer splitSerializer; - - public IcebergEnumeratorStateSerializer(boolean caseSensitive) { - this.splitSerializer = new IcebergSourceSplitSerializer(caseSensitive); - } - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(IcebergEnumeratorState enumState) throws IOException { - return serializeV2(enumState); - } - - @Override - public IcebergEnumeratorState deserialize(int version, byte[] serialized) throws IOException { - switch (version) { - case 1: - return deserializeV1(serialized); - case 2: - return deserializeV2(serialized); - default: - throw new IOException("Unknown version: " + version); - } - } - - @VisibleForTesting - byte[] serializeV1(IcebergEnumeratorState enumState) throws IOException { - DataOutputSerializer out = SERIALIZER_CACHE.get(); - serializeEnumeratorPosition(out, enumState.lastEnumeratedPosition(), positionSerializer); - serializePendingSplits(out, enumState.pendingSplits(), splitSerializer); - byte[] result = out.getCopyOfBuffer(); - out.clear(); - return result; - } - - @VisibleForTesting - IcebergEnumeratorState deserializeV1(byte[] serialized) throws IOException { - DataInputDeserializer in = new DataInputDeserializer(serialized); - IcebergEnumeratorPosition enumeratorPosition = - deserializeEnumeratorPosition(in, positionSerializer); - Collection pendingSplits = - deserializePendingSplits(in, splitSerializer); - return new IcebergEnumeratorState(enumeratorPosition, pendingSplits); - } - - @VisibleForTesting - byte[] serializeV2(IcebergEnumeratorState enumState) throws IOException { - DataOutputSerializer out = SERIALIZER_CACHE.get(); - serializeEnumeratorPosition(out, enumState.lastEnumeratedPosition(), positionSerializer); - serializePendingSplits(out, enumState.pendingSplits(), splitSerializer); - serializeEnumerationSplitCountHistory(out, enumState.enumerationSplitCountHistory()); - byte[] result = out.getCopyOfBuffer(); - out.clear(); - return result; - } - - @VisibleForTesting - IcebergEnumeratorState deserializeV2(byte[] serialized) throws IOException { - DataInputDeserializer in = new DataInputDeserializer(serialized); - IcebergEnumeratorPosition enumeratorPosition = - deserializeEnumeratorPosition(in, positionSerializer); - Collection pendingSplits = - deserializePendingSplits(in, splitSerializer); - int[] enumerationSplitCountHistory = 
deserializeEnumerationSplitCountHistory(in); - return new IcebergEnumeratorState( - enumeratorPosition, pendingSplits, enumerationSplitCountHistory); - } - - private static void serializeEnumeratorPosition( - DataOutputSerializer out, - IcebergEnumeratorPosition enumeratorPosition, - IcebergEnumeratorPositionSerializer positionSerializer) - throws IOException { - out.writeBoolean(enumeratorPosition != null); - if (enumeratorPosition != null) { - out.writeInt(positionSerializer.getVersion()); - byte[] positionBytes = positionSerializer.serialize(enumeratorPosition); - out.writeInt(positionBytes.length); - out.write(positionBytes); - } - } - - private static IcebergEnumeratorPosition deserializeEnumeratorPosition( - DataInputDeserializer in, IcebergEnumeratorPositionSerializer positionSerializer) - throws IOException { - IcebergEnumeratorPosition enumeratorPosition = null; - if (in.readBoolean()) { - int version = in.readInt(); - byte[] positionBytes = new byte[in.readInt()]; - in.read(positionBytes); - enumeratorPosition = positionSerializer.deserialize(version, positionBytes); - } - return enumeratorPosition; - } - - private static void serializePendingSplits( - DataOutputSerializer out, - Collection pendingSplits, - IcebergSourceSplitSerializer splitSerializer) - throws IOException { - out.writeInt(splitSerializer.getVersion()); - out.writeInt(pendingSplits.size()); - for (IcebergSourceSplitState splitState : pendingSplits) { - byte[] splitBytes = splitSerializer.serialize(splitState.split()); - out.writeInt(splitBytes.length); - out.write(splitBytes); - out.writeUTF(splitState.status().name()); - } - } - - private static Collection deserializePendingSplits( - DataInputDeserializer in, IcebergSourceSplitSerializer splitSerializer) throws IOException { - int splitSerializerVersion = in.readInt(); - int splitCount = in.readInt(); - Collection pendingSplits = Lists.newArrayListWithCapacity(splitCount); - for (int i = 0; i < splitCount; ++i) { - byte[] splitBytes = new byte[in.readInt()]; - in.read(splitBytes); - IcebergSourceSplit split = splitSerializer.deserialize(splitSerializerVersion, splitBytes); - String statusName = in.readUTF(); - pendingSplits.add( - new IcebergSourceSplitState(split, IcebergSourceSplitStatus.valueOf(statusName))); - } - return pendingSplits; - } - - private static void serializeEnumerationSplitCountHistory( - DataOutputSerializer out, int[] enumerationSplitCountHistory) throws IOException { - out.writeInt(enumerationSplitCountHistory.length); - for (int enumerationSplitCount : enumerationSplitCountHistory) { - out.writeInt(enumerationSplitCount); - } - } - - private static int[] deserializeEnumerationSplitCountHistory(DataInputDeserializer in) - throws IOException { - int historySize = in.readInt(); - int[] history = new int[historySize]; - if (historySize > 0) { - for (int i = 0; i < historySize; ++i) { - history[i] = in.readInt(); - } - } - - return history; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java deleted file mode 100644 index 4e55ea5d5fd6..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.iceberg.flink.source.assigner.SplitAssigner; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; - -/** One-time split enumeration at the start-up for batch execution */ -@Internal -public class StaticIcebergEnumerator extends AbstractIcebergEnumerator { - private final SplitAssigner assigner; - - public StaticIcebergEnumerator( - SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { - super(enumeratorContext, assigner); - this.assigner = assigner; - } - - @Override - public void start() { - super.start(); - } - - @Override - protected boolean shouldWaitForMoreSplits() { - return false; - } - - @Override - public IcebergEnumeratorState snapshotState(long checkpointId) { - return new IcebergEnumeratorState(null, assigner.state(), new int[0]); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java deleted file mode 100644 index 7b94c364c976..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.util.Collections; -import java.util.Set; -import javax.annotation.Nullable; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.file.src.util.Pool; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * {@link RecordsWithSplitIds} is used to pass a batch of records from fetcher to source reader. 
- * Batching is to improve the efficiency for records handover. - * - * <p>{@link RecordsWithSplitIds} interface can encapsulate batches from multiple splits. This is - * the case for Kafka source where fetchers can retrieve records from multiple Kafka partitions at - * the same time. - * - * <p>For file-based sources like Iceberg, readers always read one split/file at a time. Hence, we - * will only have a batch of records for one split here. - * - * <p>
    This class uses array to store a batch of records from the same file (with the same - * fileOffset). - */ -class ArrayBatchRecords implements RecordsWithSplitIds> { - @Nullable private String splitId; - @Nullable private final Pool.Recycler recycler; - @Nullable private final T[] records; - private final int numberOfRecords; - private final Set finishedSplits; - private final RecordAndPosition recordAndPosition; - - // point to current read position within the records array - private int position; - - private ArrayBatchRecords( - @Nullable String splitId, - @Nullable Pool.Recycler recycler, - @Nullable T[] records, - int numberOfRecords, - int fileOffset, - long startingRecordOffset, - Set finishedSplits) { - Preconditions.checkArgument(numberOfRecords >= 0, "numberOfRecords can't be negative"); - Preconditions.checkArgument(fileOffset >= 0, "fileOffset can't be negative"); - Preconditions.checkArgument(startingRecordOffset >= 0, "numberOfRecords can't be negative"); - - this.splitId = splitId; - this.recycler = recycler; - this.records = records; - this.numberOfRecords = numberOfRecords; - this.finishedSplits = - Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); - this.recordAndPosition = new RecordAndPosition<>(); - - recordAndPosition.set(null, fileOffset, startingRecordOffset); - this.position = 0; - } - - @Nullable - @Override - public String nextSplit() { - String nextSplit = this.splitId; - // set the splitId to null to indicate no more splits - // this class only contains record for one split - this.splitId = null; - return nextSplit; - } - - @Nullable - @Override - public RecordAndPosition nextRecordFromSplit() { - if (position < numberOfRecords) { - recordAndPosition.record(records[position]); - position++; - return recordAndPosition; - } else { - return null; - } - } - - /** - * This method is called when all records from this batch has been emitted. If recycler is set, it - * should be called to return the records array back to pool. - */ - @Override - public void recycle() { - if (recycler != null) { - recycler.recycle(records); - } - } - - @Override - public Set finishedSplits() { - return finishedSplits; - } - - @VisibleForTesting - T[] records() { - return records; - } - - @VisibleForTesting - int numberOfRecords() { - return numberOfRecords; - } - - /** - * Create a ArrayBatchRecords backed up an array with records from the same file - * - * @param splitId Iceberg source only read from one split a time. We never have multiple records - * from multiple splits. - * @param recycler Because {@link DataIterator} with {@link RowData} returns an iterator of reused - * RowData object, we need to clone RowData eagerly when constructing a batch of records. We - * can use object pool to reuse the RowData array object which can be expensive to create. - * This recycler can be provided to recycle the array object back to pool after read is - * exhausted. If the {@link DataIterator} returns an iterator of non-reused objects, we don't - * need to clone objects. It is cheap to just create the batch array. Hence, we don't need - * object pool and recycler can be set to null. 
- * @param records an array (maybe reused) holding a batch of records - * @param numberOfRecords actual number of records in the array - * @param fileOffset fileOffset for all records in this batch - * @param startingRecordOffset starting recordOffset - * @param record type - */ - public static ArrayBatchRecords forRecords( - String splitId, - Pool.Recycler recycler, - T[] records, - int numberOfRecords, - int fileOffset, - long startingRecordOffset) { - return new ArrayBatchRecords<>( - splitId, - recycler, - records, - numberOfRecords, - fileOffset, - startingRecordOffset, - Collections.emptySet()); - } - - /** - * Create ab ArrayBatchRecords with only finished split id - * - * @param splitId for the split that is just exhausted - */ - public static ArrayBatchRecords finishedSplit(String splitId) { - return new ArrayBatchRecords<>(null, null, null, 0, 0, 0, Collections.singleton(splitId)); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java deleted file mode 100644 index 306afd1811be..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.io.IOException; -import java.util.NoSuchElementException; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.SourceReaderOptions; -import org.apache.flink.connector.file.src.util.Pool; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** This implementation stores record batch in array from recyclable pool */ -class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { - private final int batchSize; - private final int handoverQueueSize; - private final RecordFactory recordFactory; - - private transient Pool pool; - - ArrayPoolDataIteratorBatcher(ReadableConfig config, RecordFactory recordFactory) { - this.batchSize = config.get(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT); - this.handoverQueueSize = config.get(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY); - this.recordFactory = recordFactory; - } - - @Override - public CloseableIterator>> batch( - String splitId, DataIterator inputIterator) { - Preconditions.checkArgument(inputIterator != null, "Input data iterator can't be null"); - // lazily create pool as it is not serializable - if (pool == null) { - this.pool = createPoolOfBatches(handoverQueueSize); - } - return new ArrayPoolBatchIterator(splitId, inputIterator, pool); - } - - private Pool createPoolOfBatches(int numBatches) { - Pool poolOfBatches = new Pool<>(numBatches); - for (int batchId = 0; batchId < numBatches; batchId++) { - T[] batch = recordFactory.createBatch(batchSize); - poolOfBatches.add(batch); - } - - return poolOfBatches; - } - - private class ArrayPoolBatchIterator - implements CloseableIterator>> { - - private final String splitId; - private final DataIterator inputIterator; - private final Pool pool; - - ArrayPoolBatchIterator(String splitId, DataIterator inputIterator, Pool pool) { - this.splitId = splitId; - this.inputIterator = inputIterator; - this.pool = pool; - } - - @Override - public boolean hasNext() { - return inputIterator.hasNext(); - } - - @Override - public RecordsWithSplitIds> next() { - if (!inputIterator.hasNext()) { - throw new NoSuchElementException(); - } - - T[] batch = getCachedEntry(); - int recordCount = 0; - while (inputIterator.hasNext() && recordCount < batchSize) { - // The record produced by inputIterator can be reused like for the RowData case. - // inputIterator.next() can't be called again until the copy is made - // since the record is not consumed immediately. - T nextRecord = inputIterator.next(); - recordFactory.clone(nextRecord, batch, recordCount); - recordCount++; - if (!inputIterator.currentFileHasNext()) { - // break early so that records in the ArrayResultIterator - // have the same fileOffset. 
- break; - } - } - - return ArrayBatchRecords.forRecords( - splitId, - pool.recycler(), - batch, - recordCount, - inputIterator.fileOffset(), - inputIterator.recordOffset() - recordCount); - } - - @Override - public void close() throws IOException { - inputIterator.close(); - } - - private T[] getCachedEntry() { - try { - return pool.pollEntry(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new RuntimeException("Interrupted while waiting for array pool entry", e); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java deleted file mode 100644 index b158b0871a53..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; -import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.flink.FlinkSchemaUtil; - -public class AvroGenericRecordConverter implements RowDataConverter { - private final Schema avroSchema; - private final RowDataToAvroConverters.RowDataToAvroConverter flinkConverter; - private final TypeInformation outputTypeInfo; - - private AvroGenericRecordConverter(Schema avroSchema, RowType rowType) { - this.avroSchema = avroSchema; - this.flinkConverter = RowDataToAvroConverters.createConverter(rowType); - this.outputTypeInfo = new GenericRecordAvroTypeInfo(avroSchema); - } - - public static AvroGenericRecordConverter fromIcebergSchema( - org.apache.iceberg.Schema icebergSchema, String tableName) { - RowType rowType = FlinkSchemaUtil.convert(icebergSchema); - Schema avroSchema = AvroSchemaUtil.convert(icebergSchema, tableName); - return new AvroGenericRecordConverter(avroSchema, rowType); - } - - public static AvroGenericRecordConverter fromAvroSchema(Schema avroSchema, String tableName) { - DataType dataType = AvroSchemaConverter.convertToDataType(avroSchema.toString()); - LogicalType logicalType = 
TypeConversions.fromDataToLogicalType(dataType); - RowType rowType = RowType.of(logicalType.getChildren().toArray(new LogicalType[0])); - return new AvroGenericRecordConverter(avroSchema, rowType); - } - - @Override - public GenericRecord apply(RowData rowData) { - return (GenericRecord) flinkConverter.convert(avroSchema, rowData); - } - - @Override - public TypeInformation getProducedType() { - return outputTypeInfo; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java deleted file mode 100644 index f89e5ce13474..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.util.List; -import org.apache.avro.generic.GenericRecord; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.source.AvroGenericRecordFileScanTaskReader; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.IcebergSource; -import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; -import org.apache.iceberg.flink.source.RowDataToAvroGenericRecordConverter; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * Read Iceberg rows as {@link GenericRecord}. - * - * @deprecated since 1.7.0. Will be removed in 2.0.0; use {@link - * IcebergSource#forOutputType(RowDataConverter)} and {@link AvroGenericRecordConverter} - * instead. - */ -@Deprecated -public class AvroGenericRecordReaderFunction extends DataIteratorReaderFunction { - private final String tableName; - private final Schema readSchema; - private final FileIO io; - private final EncryptionManager encryption; - private final RowDataFileScanTaskReader rowDataReader; - - private transient RowDataToAvroGenericRecordConverter converter; - - /** - * Create a reader function without projection and name mapping. Column name is case-insensitive. 
- */ - public static AvroGenericRecordReaderFunction fromTable(Table table) { - return new AvroGenericRecordReaderFunction( - table.name(), - new Configuration(), - table.schema(), - null, - null, - false, - table.io(), - table.encryption(), - null); - } - - public AvroGenericRecordReaderFunction( - String tableName, - ReadableConfig config, - Schema tableSchema, - Schema projectedSchema, - String nameMapping, - boolean caseSensitive, - FileIO io, - EncryptionManager encryption, - List filters) { - super(new ListDataIteratorBatcher<>(config)); - this.tableName = tableName; - this.readSchema = readSchema(tableSchema, projectedSchema); - this.io = io; - this.encryption = encryption; - this.rowDataReader = - new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters); - } - - @Override - protected DataIterator createDataIterator(IcebergSourceSplit split) { - return new DataIterator<>( - new AvroGenericRecordFileScanTaskReader(rowDataReader, lazyConverter()), - split.task(), - io, - encryption); - } - - private RowDataToAvroGenericRecordConverter lazyConverter() { - if (converter == null) { - this.converter = RowDataToAvroGenericRecordConverter.fromIcebergSchema(tableName, readSchema); - } - return converter; - } - - private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { - Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); - return projectedSchema == null ? tableSchema : projectedSchema; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java deleted file mode 100644 index 4bb6f0a98c4c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; -import java.util.Comparator; -import java.util.concurrent.TimeUnit; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.types.Conversions; -import org.apache.iceberg.types.Type.TypeID; -import org.apache.iceberg.types.Types; - -/** - * {@link SplitWatermarkExtractor} implementation which uses an Iceberg timestamp column statistics - * to get the watermarks for the {@link IcebergSourceSplit}. 
This watermark is emitted by the {@link - * WatermarkExtractorRecordEmitter} along with the actual records. - */ -@Internal -public class ColumnStatsWatermarkExtractor implements SplitWatermarkExtractor, Serializable { - private final int eventTimeFieldId; - private final String eventTimeFieldName; - private final TimeUnit timeUnit; - - /** - * Creates the extractor. - * - * @param schema The schema of the Table - * @param eventTimeFieldName The column which should be used as an event time - * @param timeUnit Used for converting the long value to epoch milliseconds - */ - public ColumnStatsWatermarkExtractor( - Schema schema, String eventTimeFieldName, TimeUnit timeUnit) { - Types.NestedField field = schema.findField(eventTimeFieldName); - TypeID typeID = field.type().typeId(); - Preconditions.checkArgument( - typeID.equals(TypeID.LONG) || typeID.equals(TypeID.TIMESTAMP), - "Found %s, expected a LONG or TIMESTAMP column for watermark generation.", - typeID); - this.eventTimeFieldId = field.fieldId(); - this.eventTimeFieldName = eventTimeFieldName; - // Use the timeUnit only for Long columns. - this.timeUnit = typeID.equals(TypeID.LONG) ? timeUnit : TimeUnit.MICROSECONDS; - } - - @VisibleForTesting - ColumnStatsWatermarkExtractor(int eventTimeFieldId, String eventTimeFieldName) { - this.eventTimeFieldId = eventTimeFieldId; - this.eventTimeFieldName = eventTimeFieldName; - this.timeUnit = TimeUnit.MICROSECONDS; - } - - /** - * Get the watermark for a split using column statistics. - * - * @param split The split - * @return The watermark - * @throws IllegalArgumentException if there is no statistics for the column - */ - @Override - public long extractWatermark(IcebergSourceSplit split) { - return split.task().files().stream() - .map( - scanTask -> { - Preconditions.checkArgument( - scanTask.file().lowerBounds() != null - && scanTask.file().lowerBounds().get(eventTimeFieldId) != null, - "Missing statistics for column name = %s in file = %s", - eventTimeFieldName, - eventTimeFieldId, - scanTask.file()); - return timeUnit.toMillis( - Conversions.fromByteBuffer( - Types.LongType.get(), scanTask.file().lowerBounds().get(eventTimeFieldId))); - }) - .min(Comparator.comparingLong(l -> l)) - .get(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java deleted file mode 100644 index e1e7c17d63c5..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ConverterReaderFunction.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.util.List; -import org.apache.flink.annotation.Internal; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Schema; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.FileScanTaskReader; -import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -@Internal -public class ConverterReaderFunction extends DataIteratorReaderFunction { - private final RowDataConverter converter; - private final Schema tableSchema; - private final Schema readSchema; - private final String nameMapping; - private final boolean caseSensitive; - private final FileIO io; - private final EncryptionManager encryption; - private final List filters; - private final long limit; - - private transient RecordLimiter recordLimiter = null; - - public ConverterReaderFunction( - RowDataConverter converter, - ReadableConfig config, - Schema tableSchema, - Schema projectedSchema, - String nameMapping, - boolean caseSensitive, - FileIO io, - EncryptionManager encryption, - List filters, - long limit) { - super(new ListDataIteratorBatcher<>(config)); - this.converter = converter; - this.tableSchema = tableSchema; - this.readSchema = readSchema(tableSchema, projectedSchema); - this.nameMapping = nameMapping; - this.caseSensitive = caseSensitive; - this.io = io; - this.encryption = encryption; - this.filters = filters; - this.limit = limit; - } - - @Override - protected DataIterator createDataIterator(IcebergSourceSplit split) { - RowDataFileScanTaskReader rowDataReader = - new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters); - return new LimitableDataIterator<>( - new ConverterFileScanTaskReader<>(rowDataReader, converter), - split.task(), - io, - encryption, - lazyLimiter()); - } - - private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { - Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); - return projectedSchema == null ? 
tableSchema : projectedSchema; - } - - /** Lazily create RecordLimiter to avoid the need to make it serializable */ - private RecordLimiter lazyLimiter() { - if (recordLimiter == null) { - this.recordLimiter = RecordLimiter.create(limit); - } - - return recordLimiter; - } - - private static class ConverterFileScanTaskReader implements FileScanTaskReader { - private final RowDataFileScanTaskReader rowDataReader; - private final RowDataConverter converter; - - ConverterFileScanTaskReader( - RowDataFileScanTaskReader rowDataReader, RowDataConverter converter) { - this.rowDataReader = rowDataReader; - this.converter = converter; - } - - @Override - public CloseableIterator open( - FileScanTask fileScanTask, InputFilesDecryptor inputFilesDecryptor) { - return CloseableIterator.transform( - rowDataReader.open(fileScanTask, inputFilesDecryptor), converter); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java deleted file mode 100644 index c376e359c600..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.io.CloseableIterator; - -/** - * Batcher converts iterator of T into iterator of batched {@code - * RecordsWithSplitIds>}, as FLIP-27's {@link SplitReader#fetch()} returns - * batched records. - */ -@FunctionalInterface -public interface DataIteratorBatcher extends Serializable { - CloseableIterator>> batch( - String splitId, DataIterator inputIterator); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java deleted file mode 100644 index bbf797ef4aa8..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.CloseableIterator; - -/** A {@link ReaderFunction} implementation that uses {@link DataIterator}. */ -public abstract class DataIteratorReaderFunction implements ReaderFunction { - private final DataIteratorBatcher batcher; - - public DataIteratorReaderFunction(DataIteratorBatcher batcher) { - this.batcher = batcher; - } - - protected abstract DataIterator createDataIterator(IcebergSourceSplit split); - - @Override - public CloseableIterator>> apply( - IcebergSourceSplit split) { - DataIterator inputIterator = createDataIterator(split); - inputIterator.seek(split.fileOffset(), split.recordOffset()); - return batcher.batch(split.splitId(), inputIterator); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java deleted file mode 100644 index f143b8d2df2e..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.util.Collection; -import java.util.Collections; -import java.util.Map; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.connector.base.source.reader.SingleThreadMultiplexSourceReaderBase; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.SerializableComparator; -import org.apache.iceberg.flink.source.split.SplitRequestEvent; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -@Internal -public class IcebergSourceReader - extends SingleThreadMultiplexSourceReaderBase< - RecordAndPosition, T, IcebergSourceSplit, IcebergSourceSplit> { - - public IcebergSourceReader( - SerializableRecordEmitter emitter, - IcebergSourceReaderMetrics metrics, - ReaderFunction readerFunction, - SerializableComparator splitComparator, - SourceReaderContext context) { - super( - () -> new IcebergSourceSplitReader<>(metrics, readerFunction, splitComparator, context), - emitter, - context.getConfiguration(), - context); - } - - @Override - public void start() { - // We request a split only if we did not get splits during the checkpoint restore. - // Otherwise, reader restarts will keep requesting more and more splits. - if (getNumberOfCurrentlyAssignedSplits() == 0) { - requestSplit(Collections.emptyList()); - } - } - - @Override - protected void onSplitFinished(Map finishedSplitIds) { - requestSplit(Lists.newArrayList(finishedSplitIds.keySet())); - } - - @Override - protected IcebergSourceSplit initializedState(IcebergSourceSplit split) { - return split; - } - - @Override - protected IcebergSourceSplit toSplitType(String splitId, IcebergSourceSplit splitState) { - return splitState; - } - - private void requestSplit(Collection finishedSplitIds) { - context.sendSourceEventToCoordinator(new SplitRequestEvent(finishedSplitIds)); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java deleted file mode 100644 index 2a3e1dd86b95..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.MetricGroup; - -public class IcebergSourceReaderMetrics { - private final Counter assignedSplits; - private final Counter assignedBytes; - private final Counter finishedSplits; - private final Counter finishedBytes; - private final Counter splitReaderFetchCalls; - - public IcebergSourceReaderMetrics(MetricGroup metrics, String fullTableName) { - MetricGroup readerMetrics = - metrics.addGroup("IcebergSourceReader").addGroup("table", fullTableName); - - this.assignedSplits = readerMetrics.counter("assignedSplits"); - this.assignedBytes = readerMetrics.counter("assignedBytes"); - this.finishedSplits = readerMetrics.counter("finishedSplits"); - this.finishedBytes = readerMetrics.counter("finishedBytes"); - this.splitReaderFetchCalls = readerMetrics.counter("splitReaderFetchCalls"); - } - - public void incrementAssignedSplits(long count) { - assignedSplits.inc(count); - } - - public void incrementAssignedBytes(long count) { - assignedBytes.inc(count); - } - - public void incrementFinishedSplits(long count) { - finishedSplits.inc(count); - } - - public void incrementFinishedBytes(long count) { - finishedBytes.inc(count); - } - - public void incrementSplitReaderFetchCalls(long count) { - splitReaderFetchCalls.inc(count); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java deleted file mode 100644 index bcd72e25036b..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Queue; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.connector.base.source.reader.RecordsBySplits; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; -import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; -import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.SerializableComparator; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Queues; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -class IcebergSourceSplitReader implements SplitReader, IcebergSourceSplit> { - private static final Logger LOG = LoggerFactory.getLogger(IcebergSourceSplitReader.class); - - private final IcebergSourceReaderMetrics metrics; - private final ReaderFunction openSplitFunction; - private final SerializableComparator splitComparator; - private final int indexOfSubtask; - private final Queue splits; - - private CloseableIterator>> currentReader; - private IcebergSourceSplit currentSplit; - private String currentSplitId; - - IcebergSourceSplitReader( - IcebergSourceReaderMetrics metrics, - ReaderFunction openSplitFunction, - SerializableComparator splitComparator, - SourceReaderContext context) { - this.metrics = metrics; - this.openSplitFunction = openSplitFunction; - this.splitComparator = splitComparator; - this.indexOfSubtask = context.getIndexOfSubtask(); - this.splits = Queues.newArrayDeque(); - } - - /** - * The method reads a batch of records from the assigned splits. If all the records from the - * current split are returned then it will emit a {@link ArrayBatchRecords#finishedSplit(String)} - * batch to signal this event. In the next fetch loop the reader will continue with the next split - * (if any). - * - * @return The fetched records - * @throws IOException If there is an error during reading - */ - @Override - public RecordsWithSplitIds> fetch() throws IOException { - metrics.incrementSplitReaderFetchCalls(1); - if (currentReader == null) { - IcebergSourceSplit nextSplit = splits.poll(); - if (nextSplit != null) { - currentSplit = nextSplit; - currentSplitId = nextSplit.splitId(); - currentReader = openSplitFunction.apply(currentSplit); - } else { - // return an empty result, which will lead to split fetch to be idle. - // SplitFetcherManager will then close idle fetcher. 
- return new RecordsBySplits<>(Collections.emptyMap(), Collections.emptySet()); - } - } - - if (currentReader.hasNext()) { - // Because Iterator#next() doesn't support checked exception, - // we need to wrap and unwrap the checked IOException with UncheckedIOException - try { - return currentReader.next(); - } catch (UncheckedIOException e) { - throw e.getCause(); - } - } else { - return finishSplit(); - } - } - - @Override - public void handleSplitsChanges(SplitsChange splitsChange) { - if (!(splitsChange instanceof SplitsAddition)) { - throw new UnsupportedOperationException( - String.format("Unsupported split change: %s", splitsChange.getClass())); - } - - if (splitComparator != null) { - List newSplits = Lists.newArrayList(splitsChange.splits()); - newSplits.sort(splitComparator); - LOG.info("Add {} splits to reader: {}", newSplits.size(), newSplits); - splits.addAll(newSplits); - } else { - LOG.info("Add {} splits to reader", splitsChange.splits().size()); - splits.addAll(splitsChange.splits()); - } - metrics.incrementAssignedSplits(splitsChange.splits().size()); - metrics.incrementAssignedBytes(calculateBytes(splitsChange)); - } - - @Override - public void wakeUp() {} - - @Override - public void close() throws Exception { - currentSplitId = null; - if (currentReader != null) { - currentReader.close(); - } - } - - @Override - public void pauseOrResumeSplits( - Collection splitsToPause, Collection splitsToResume) { - // IcebergSourceSplitReader only reads splits sequentially. When waiting for watermark alignment - // the SourceOperator will stop processing and recycling the fetched batches. This exhausts the - // {@link ArrayPoolDataIteratorBatcher#pool} and the `currentReader.next()` call will be - // blocked even without split-level watermark alignment. Based on this the - // `pauseOrResumeSplits` and the `wakeUp` are left empty. - } - - private long calculateBytes(IcebergSourceSplit split) { - return split.task().files().stream().map(FileScanTask::length).reduce(0L, Long::sum); - } - - private long calculateBytes(SplitsChange splitsChanges) { - return splitsChanges.splits().stream().map(this::calculateBytes).reduce(0L, Long::sum); - } - - private ArrayBatchRecords finishSplit() throws IOException { - if (currentReader != null) { - currentReader.close(); - currentReader = null; - } - - ArrayBatchRecords finishRecords = ArrayBatchRecords.finishedSplit(currentSplitId); - LOG.info("Split reader {} finished split: {}", indexOfSubtask, currentSplitId); - metrics.incrementFinishedSplits(1); - metrics.incrementFinishedBytes(calculateBytes(currentSplit)); - currentSplitId = null; - return finishRecords; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java deleted file mode 100644 index 020e87646d05..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.FileScanTaskReader; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class LimitableDataIterator extends DataIterator { - private final RecordLimiter limiter; - - LimitableDataIterator( - FileScanTaskReader fileScanTaskReader, - CombinedScanTask task, - FileIO io, - EncryptionManager encryption, - RecordLimiter limiter) { - super(fileScanTaskReader, task, io, encryption); - Preconditions.checkArgument(limiter != null, "Invalid record limiter: null"); - this.limiter = limiter; - } - - @Override - public boolean hasNext() { - if (limiter.reachedLimit()) { - return false; - } - - return super.hasNext(); - } - - @Override - public T next() { - limiter.increment(); - return super.next(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java deleted file mode 100644 index 1acb3df76102..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.util.Collections; -import java.util.List; -import java.util.Set; -import javax.annotation.Nullable; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class ListBatchRecords implements RecordsWithSplitIds> { - private String splitId; - private final List records; - private final Set finishedSplits; - private final RecordAndPosition recordAndPosition; - - // point to current read position within the records list - private int position; - - ListBatchRecords( - String splitId, - List records, - int fileOffset, - long startingRecordOffset, - Set finishedSplits) { - this.splitId = splitId; - this.records = records; - this.finishedSplits = - Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); - this.recordAndPosition = new RecordAndPosition<>(); - this.recordAndPosition.set(null, fileOffset, startingRecordOffset); - - this.position = 0; - } - - @Nullable - @Override - public String nextSplit() { - String nextSplit = this.splitId; - // set the splitId to null to indicate no more splits - // this class only contains record for one split - this.splitId = null; - return nextSplit; - } - - @Nullable - @Override - public RecordAndPosition nextRecordFromSplit() { - if (position < records.size()) { - recordAndPosition.record(records.get(position)); - position++; - return recordAndPosition; - } else { - return null; - } - } - - @Override - public Set finishedSplits() { - return finishedSplits; - } - - public static ListBatchRecords forRecords( - String splitId, List records, int fileOffset, long startingRecordOffset) { - return new ListBatchRecords<>( - splitId, records, fileOffset, startingRecordOffset, Collections.emptySet()); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java deleted file mode 100644 index 365416239d37..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.io.IOException; -import java.util.List; -import java.util.NoSuchElementException; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -/** - * FlinkRecordReaderFunction essentially cloned objects already. So there is no need to use array - * pool to clone objects. Simply create a new ArrayList for each batch. - */ -class ListDataIteratorBatcher implements DataIteratorBatcher { - - private final int batchSize; - - ListDataIteratorBatcher(ReadableConfig config) { - this.batchSize = config.get(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT); - } - - @Override - public CloseableIterator>> batch( - String splitId, DataIterator dataIterator) { - return new ListBatchIterator(splitId, dataIterator); - } - - private class ListBatchIterator - implements CloseableIterator>> { - - private final String splitId; - private final DataIterator inputIterator; - - ListBatchIterator(String splitId, DataIterator inputIterator) { - this.splitId = splitId; - this.inputIterator = inputIterator; - } - - @Override - public boolean hasNext() { - return inputIterator.hasNext(); - } - - @Override - public RecordsWithSplitIds> next() { - if (!inputIterator.hasNext()) { - throw new NoSuchElementException(); - } - - final List batch = Lists.newArrayListWithCapacity(batchSize); - int recordCount = 0; - while (inputIterator.hasNext() && recordCount < batchSize) { - T nextRecord = inputIterator.next(); - batch.add(nextRecord); - recordCount++; - if (!inputIterator.currentFileHasNext()) { - // break early so that records have the same fileOffset. - break; - } - } - - return ListBatchRecords.forRecords( - splitId, batch, inputIterator.fileOffset(), inputIterator.recordOffset() - recordCount); - } - - @Override - public void close() throws IOException { - if (inputIterator != null) { - inputIterator.close(); - } - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java deleted file mode 100644 index fb4466913b90..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.DataTaskReader; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** Reading metadata tables (like snapshots, manifests, etc.) */ -@Internal -public class MetaDataReaderFunction extends DataIteratorReaderFunction { - private final Schema readSchema; - private final FileIO io; - private final EncryptionManager encryption; - - public MetaDataReaderFunction( - ReadableConfig config, - Schema tableSchema, - Schema projectedSchema, - FileIO io, - EncryptionManager encryption) { - super( - new ArrayPoolDataIteratorBatcher<>( - config, - new RowDataRecordFactory( - FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); - this.readSchema = readSchema(tableSchema, projectedSchema); - this.io = io; - this.encryption = encryption; - } - - @Override - public DataIterator createDataIterator(IcebergSourceSplit split) { - return new DataIterator<>(new DataTaskReader(readSchema), split.task(), io, encryption); - } - - private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { - Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); - return projectedSchema == null ? tableSchema : projectedSchema; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java deleted file mode 100644 index 1ea91f10b4e7..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; -import java.util.function.Function; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.CloseableIterator; - -@FunctionalInterface -public interface ReaderFunction - extends Serializable, - Function< - IcebergSourceSplit, CloseableIterator>>> {} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java deleted file mode 100644 index 10e7d2037a30..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.util.Locale; -import org.apache.flink.annotation.Internal; - -/** - * A record along with the reader position to be stored in the checkpoint. - * - *

<p>The position defines the point in the reader AFTER the record. Record processing and updating - * checkpointed state happens atomically. The position points to where the reader should resume - * after this record is processed. - * - *

    This mutable object is useful in cases where only one instance of a {@code RecordAndPosition} - * is needed at a time. Then the same instance of RecordAndPosition can be reused. - */ -@Internal -public class RecordAndPosition { - private T record; - private int fileOffset; - private long recordOffset; - - public RecordAndPosition(T record, int fileOffset, long recordOffset) { - this.record = record; - this.fileOffset = fileOffset; - this.recordOffset = recordOffset; - } - - public RecordAndPosition() {} - - // ------------------------------------------------------------------------ - - public T record() { - return record; - } - - public int fileOffset() { - return fileOffset; - } - - public long recordOffset() { - return recordOffset; - } - - /** Updates the record and position in this object. */ - public void set(T newRecord, int newFileOffset, long newRecordOffset) { - this.record = newRecord; - this.fileOffset = newFileOffset; - this.recordOffset = newRecordOffset; - } - - /** Sets the next record of a sequence. This increments the {@code recordOffset} by one. */ - public void record(T nextRecord) { - this.record = nextRecord; - this.recordOffset++; - } - - @Override - public String toString() { - return String.format(Locale.ROOT, "%s @ %d + %d", record, fileOffset, recordOffset); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java deleted file mode 100644 index ef92e2e6b81f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; - -/** - * In FLIP-27 source, SplitReader#fetch() returns a batch of records. Since DataIterator for RowData - * returns an iterator of reused RowData objects, RecordFactory is needed to (1) create object array - * that is recyclable via pool. (2) clone RowData element from DataIterator to the batch array. 
- */ -interface RecordFactory extends Serializable { - /** Create a batch of records */ - T[] createBatch(int batchSize); - - /** Clone record into the specified position of the batch array */ - void clone(T from, T[] batch, int position); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java deleted file mode 100644 index f260a53089ff..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.annotation.Internal; - -@Internal -class RecordLimiter { - private final long limit; - private final AtomicLong counter; - - static RecordLimiter create(long limit) { - return new RecordLimiter(limit); - } - - private RecordLimiter(long limit) { - this.limit = limit; - this.counter = new AtomicLong(0); - } - - public boolean reachedLimit() { - return limit > 0 && counter.get() >= limit; - } - - public void increment() { - counter.incrementAndGet(); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java deleted file mode 100644 index 0e028ff91b87..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowConverter.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
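// A standalone toy of the RecordFactory contract above, assuming only the JDK: the upstream
// iterator keeps mutating one reused object, so each element must be copied into its own slot of a
// pre-allocated, recyclable batch array. IntBox and cloneInto are illustrative, not Iceberg API.
class IntBoxBatchExample {
  static final class IntBox {
    int value;
  }

  static IntBox[] createBatch(int batchSize) {
    IntBox[] batch = new IntBox[batchSize];
    for (int i = 0; i < batchSize; i++) {
      batch[i] = new IntBox();                // pre-allocate once so the slots can be recycled
    }
    return batch;
  }

  static void cloneInto(IntBox from, IntBox[] batch, int position) {
    batch[position].value = from.value;       // copy the state, keep the pooled instance
  }

  public static void main(String[] args) {
    IntBox reused = new IntBox();             // the "iterator" hands out this one object repeatedly
    IntBox[] batch = createBatch(2);
    reused.value = 1;
    cloneInto(reused, batch, 0);
    reused.value = 2;
    cloneInto(reused, batch, 1);
    System.out.println(batch[0].value + ", " + batch[1].value); // 1, 2 (storing the reference would give 2, 2)
  }
}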
- */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.conversion.DataStructureConverter; -import org.apache.flink.table.data.conversion.DataStructureConverters; -import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.flink.types.Row; -import org.apache.iceberg.flink.FlinkSchemaUtil; - -public class RowConverter implements RowDataConverter { - private final DataStructureConverter converter; - private final TypeInformation outputTypeInfo; - - private RowConverter(RowType rowType, TypeInformation rowTypeInfo) { - this.converter = - DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); - this.outputTypeInfo = rowTypeInfo; - } - - public static RowConverter fromIcebergSchema(org.apache.iceberg.Schema icebergSchema) { - RowType rowType = FlinkSchemaUtil.convert(icebergSchema); - ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(icebergSchema); - TypeInformation[] types = - resolvedSchema.getColumnDataTypes().stream() - .map(ExternalTypeInfo::of) - .toArray(TypeInformation[]::new); - String[] fieldNames = resolvedSchema.getColumnNames().toArray(String[]::new); - RowTypeInfo rowTypeInfo = new RowTypeInfo(types, fieldNames); - return new RowConverter(rowType, rowTypeInfo); - } - - @Override - public Row apply(RowData rowData) { - return (Row) converter.toExternal(rowData); - } - - @Override - public TypeInformation getProducedType() { - return outputTypeInfo; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java deleted file mode 100644 index 98bb7e981840..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataConverter.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; -import java.util.function.Function; -import org.apache.flink.api.java.typeutils.ResultTypeQueryable; -import org.apache.flink.table.data.RowData; - -/** - * Convert RowData to a different output type. 
- * - * @param output type - */ -public interface RowDataConverter - extends Function, ResultTypeQueryable, Serializable {} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java deleted file mode 100644 index c9208a0e1834..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.util.List; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class RowDataReaderFunction extends DataIteratorReaderFunction { - private final Schema tableSchema; - private final Schema readSchema; - private final String nameMapping; - private final boolean caseSensitive; - private final FileIO io; - private final EncryptionManager encryption; - private final List filters; - private final long limit; - - private transient RecordLimiter recordLimiter = null; - - public RowDataReaderFunction( - ReadableConfig config, - Schema tableSchema, - Schema projectedSchema, - String nameMapping, - boolean caseSensitive, - FileIO io, - EncryptionManager encryption, - List filters) { - this( - config, - tableSchema, - projectedSchema, - nameMapping, - caseSensitive, - io, - encryption, - filters, - -1L); - } - - public RowDataReaderFunction( - ReadableConfig config, - Schema tableSchema, - Schema projectedSchema, - String nameMapping, - boolean caseSensitive, - FileIO io, - EncryptionManager encryption, - List filters, - long limit) { - super( - new ArrayPoolDataIteratorBatcher<>( - config, - new RowDataRecordFactory( - FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); - this.tableSchema = tableSchema; - this.readSchema = readSchema(tableSchema, projectedSchema); - this.nameMapping = nameMapping; - this.caseSensitive = caseSensitive; - this.io = io; - this.encryption = encryption; - this.filters = filters; - this.limit = limit; - } - - @Override - public DataIterator createDataIterator(IcebergSourceSplit split) { - return new LimitableDataIterator<>( - new 
RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters), - split.task(), - io, - encryption, - lazyLimiter()); - } - - private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { - Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); - return projectedSchema == null ? tableSchema : projectedSchema; - } - - /** Lazily create RecordLimiter to avoid the need to make it serializable */ - private RecordLimiter lazyLimiter() { - if (recordLimiter == null) { - this.recordLimiter = RecordLimiter.create(limit); - } - - return recordLimiter; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java deleted file mode 100644 index ef2eedcf3cdd..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.InternalSerializers; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.flink.FlinkRowData; -import org.apache.iceberg.flink.data.RowDataUtil; - -class RowDataRecordFactory implements RecordFactory { - private final RowType rowType; - private final TypeSerializer[] fieldSerializers; - private final RowData.FieldGetter[] fieldGetters; - - RowDataRecordFactory(RowType rowType) { - this.rowType = rowType; - this.fieldSerializers = createFieldSerializers(rowType); - this.fieldGetters = createFieldGetters(rowType); - } - - static TypeSerializer[] createFieldSerializers(RowType rowType) { - return rowType.getChildren().stream() - .map(InternalSerializers::create) - .toArray(TypeSerializer[]::new); - } - - static RowData.FieldGetter[] createFieldGetters(RowType rowType) { - RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; - for (int i = 0; i < rowType.getFieldCount(); ++i) { - fieldGetters[i] = FlinkRowData.createFieldGetter(rowType.getTypeAt(i), i); - } - - return fieldGetters; - } - - @Override - public RowData[] createBatch(int batchSize) { - RowData[] arr = new RowData[batchSize]; - for (int i = 0; i < batchSize; ++i) { - arr[i] = new GenericRowData(rowType.getFieldCount()); - } - return arr; - } - - @Override - public void clone(RowData from, RowData[] batch, int position) { - // Set the return value from RowDataUtil.clone back to the array. 
- // Clone method returns same clone target object (reused) if it is a GenericRowData. - // Clone method will allocate a new GenericRowData object - // if the target object is NOT a GenericRowData. - // So we should always set the clone return value back to the array. - batch[position] = - RowDataUtil.clone(from, batch[position], rowType, fieldSerializers, fieldGetters); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java deleted file mode 100644 index a6e2c1dae243..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; -import org.apache.flink.annotation.Internal; -import org.apache.flink.connector.base.source.reader.RecordEmitter; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; - -@Internal -@FunctionalInterface -public interface SerializableRecordEmitter - extends RecordEmitter, T, IcebergSourceSplit>, Serializable { - static SerializableRecordEmitter defaultEmitter() { - return (element, output, split) -> { - output.collect(element.record()); - split.updatePosition(element.fileOffset(), element.recordOffset()); - }; - } - - static SerializableRecordEmitter emitterWithWatermark(SplitWatermarkExtractor extractor) { - return new WatermarkExtractorRecordEmitter<>(extractor); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java deleted file mode 100644 index d1c50ac8ca52..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; - -/** The interface used to extract watermarks from splits. */ -public interface SplitWatermarkExtractor extends Serializable { - /** Get the watermark for a split. */ - long extractWatermark(IcebergSourceSplit split); -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java deleted file mode 100644 index 02ef57d344b1..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.api.common.eventtime.Watermark; -import org.apache.flink.api.connector.source.SourceOutput; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Emitter which emits the watermarks, records and updates the split position. - * - *

    The Emitter emits watermarks at the beginning of every split provided by the {@link - * SplitWatermarkExtractor}. - */ -class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter { - private static final Logger LOG = LoggerFactory.getLogger(WatermarkExtractorRecordEmitter.class); - private final SplitWatermarkExtractor timeExtractor; - private String lastSplitId = null; - private long watermark; - - WatermarkExtractorRecordEmitter(SplitWatermarkExtractor timeExtractor) { - this.timeExtractor = timeExtractor; - } - - @Override - public void emitRecord( - RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { - if (!split.splitId().equals(lastSplitId)) { - long newWatermark = timeExtractor.extractWatermark(split); - if (newWatermark < watermark) { - LOG.info( - "Received a new split with lower watermark. Previous watermark = {}, current watermark = {}, previous split = {}, current split = {}", - watermark, - newWatermark, - lastSplitId, - split.splitId()); - } else { - watermark = newWatermark; - output.emitWatermark(new Watermark(watermark)); - LOG.debug("Watermark = {} emitted based on split = {}", watermark, lastSplitId); - } - - lastSplitId = split.splitId(); - } - - output.collect(element.record()); - split.updatePosition(element.fileOffset(), element.recordOffset()); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java deleted file mode 100644 index b6d6f60ef673..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
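// A self-contained distillation of the watermark rules implemented by the emitter above, assuming
// only the JDK: emit a watermark only when a new split starts, and never let it move backwards.
// PerSplitWatermarkTracker and its names are illustrative, not Flink or Iceberg API.
import java.util.OptionalLong;

class PerSplitWatermarkTracker {
  private String lastSplitId;
  private long watermark = Long.MIN_VALUE;

  /** Returns the watermark to emit when this split starts, or empty if nothing should be emitted. */
  OptionalLong onSplit(String splitId, long splitWatermark) {
    if (splitId.equals(lastSplitId)) {
      return OptionalLong.empty();            // still reading the same split: nothing to emit
    }

    lastSplitId = splitId;
    if (splitWatermark < watermark) {
      return OptionalLong.empty();            // late split: keep the higher watermark, emit nothing
    }

    watermark = splitWatermark;
    return OptionalLong.of(watermark);        // new split with an equal-or-higher watermark
  }

  public static void main(String[] args) {
    PerSplitWatermarkTracker tracker = new PerSplitWatermarkTracker();
    System.out.println(tracker.onSplit("split-1", 100L)); // OptionalLong[100]
    System.out.println(tracker.onSplit("split-1", 100L)); // OptionalLong.empty
    System.out.println(tracker.onSplit("split-2", 90L));  // OptionalLong.empty (would move backwards)
  }
}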
- */ -package org.apache.iceberg.flink.source.split; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Collection; -import java.util.List; -import java.util.stream.Collectors; -import javax.annotation.Nullable; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.SourceSplit; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.util.InstantiationUtil; -import org.apache.iceberg.BaseCombinedScanTask; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.ScanTaskParser; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -@Internal -public class IcebergSourceSplit implements SourceSplit, Serializable { - private static final long serialVersionUID = 1L; - private static final ThreadLocal SERIALIZER_CACHE = - ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); - - private final CombinedScanTask task; - - private int fileOffset; - private long recordOffset; - - // The splits are frequently serialized into checkpoints. - // Caching the byte representation makes repeated serialization cheap. - @Nullable private transient byte[] serializedBytesCache; - - private IcebergSourceSplit(CombinedScanTask task, int fileOffset, long recordOffset) { - this.task = task; - this.fileOffset = fileOffset; - this.recordOffset = recordOffset; - } - - public static IcebergSourceSplit fromCombinedScanTask(CombinedScanTask combinedScanTask) { - return fromCombinedScanTask(combinedScanTask, 0, 0L); - } - - public static IcebergSourceSplit fromCombinedScanTask( - CombinedScanTask combinedScanTask, int fileOffset, long recordOffset) { - return new IcebergSourceSplit(combinedScanTask, fileOffset, recordOffset); - } - - public CombinedScanTask task() { - return task; - } - - public int fileOffset() { - return fileOffset; - } - - public long recordOffset() { - return recordOffset; - } - - @Override - public String splitId() { - return MoreObjects.toStringHelper(this).add("files", toString(task.files())).toString(); - } - - public void updatePosition(int newFileOffset, long newRecordOffset) { - // invalidate the cache after position change - serializedBytesCache = null; - fileOffset = newFileOffset; - recordOffset = newRecordOffset; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("files", toString(task.files())) - .add("fileOffset", fileOffset) - .add("recordOffset", recordOffset) - .toString(); - } - - private String toString(Collection files) { - return Iterables.toString( - files.stream() - .map( - fileScanTask -> - MoreObjects.toStringHelper(fileScanTask) - .add("file", fileScanTask.file().location()) - .add("start", fileScanTask.start()) - .add("length", fileScanTask.length()) - .toString()) - .collect(Collectors.toList())); - } - - byte[] serializeV1() throws IOException { - if (serializedBytesCache == null) { - serializedBytesCache = InstantiationUtil.serializeObject(this); - } - - return serializedBytesCache; - } - - static IcebergSourceSplit deserializeV1(byte[] serialized) throws IOException { - try { - return InstantiationUtil.deserializeObject( - serialized, IcebergSourceSplit.class.getClassLoader()); - 
} catch (ClassNotFoundException e) { - throw new RuntimeException("Failed to deserialize the split.", e); - } - } - - byte[] serializeV2() throws IOException { - return serialize(2); - } - - byte[] serializeV3() throws IOException { - return serialize(3); - } - - private byte[] serialize(int version) throws IOException { - if (serializedBytesCache == null) { - DataOutputSerializer out = SERIALIZER_CACHE.get(); - Collection fileScanTasks = task.tasks(); - Preconditions.checkArgument( - fileOffset >= 0 && fileOffset < fileScanTasks.size(), - "Invalid file offset: %s. Should be within the range of [0, %s)", - fileOffset, - fileScanTasks.size()); - - out.writeInt(fileOffset); - out.writeLong(recordOffset); - out.writeInt(fileScanTasks.size()); - - for (FileScanTask fileScanTask : fileScanTasks) { - String taskJson = ScanTaskParser.toJson(fileScanTask); - writeTaskJson(out, taskJson, version); - } - - serializedBytesCache = out.getCopyOfBuffer(); - out.clear(); - } - - return serializedBytesCache; - } - - private static void writeTaskJson(DataOutputSerializer out, String taskJson, int version) - throws IOException { - switch (version) { - case 2: - out.writeUTF(taskJson); - break; - case 3: - SerializerHelper.writeLongUTF(out, taskJson); - break; - default: - throw new IllegalArgumentException("Unsupported version: " + version); - } - } - - static IcebergSourceSplit deserializeV2(byte[] serialized, boolean caseSensitive) - throws IOException { - return deserialize(serialized, caseSensitive, 2); - } - - static IcebergSourceSplit deserializeV3(byte[] serialized, boolean caseSensitive) - throws IOException { - return deserialize(serialized, caseSensitive, 3); - } - - private static IcebergSourceSplit deserialize( - byte[] serialized, boolean caseSensitive, int version) throws IOException { - DataInputDeserializer in = new DataInputDeserializer(serialized); - int fileOffset = in.readInt(); - long recordOffset = in.readLong(); - int taskCount = in.readInt(); - - List tasks = Lists.newArrayListWithCapacity(taskCount); - for (int i = 0; i < taskCount; ++i) { - String taskJson = readTaskJson(in, version); - FileScanTask task = ScanTaskParser.fromJson(taskJson, caseSensitive); - tasks.add(task); - } - - CombinedScanTask combinedScanTask = new BaseCombinedScanTask(tasks); - return IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, fileOffset, recordOffset); - } - - private static String readTaskJson(DataInputDeserializer in, int version) throws IOException { - switch (version) { - case 2: - return in.readUTF(); - case 3: - return SerializerHelper.readLongUTF(in); - default: - throw new IllegalArgumentException("Unsupported version: " + version); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java deleted file mode 100644 index d90d1dc88c91..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.split; - -import java.io.IOException; -import java.util.Locale; -import org.apache.flink.annotation.Internal; -import org.apache.flink.core.io.SimpleVersionedSerializer; - -@Internal -public class IcebergSourceSplitSerializer implements SimpleVersionedSerializer { - private static final int VERSION = 3; - - private final boolean caseSensitive; - - public IcebergSourceSplitSerializer(boolean caseSensitive) { - this.caseSensitive = caseSensitive; - } - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(IcebergSourceSplit split) throws IOException { - return split.serializeV3(); - } - - @Override - public IcebergSourceSplit deserialize(int version, byte[] serialized) throws IOException { - switch (version) { - case 1: - return IcebergSourceSplit.deserializeV1(serialized); - case 2: - return IcebergSourceSplit.deserializeV2(serialized, caseSensitive); - case 3: - return IcebergSourceSplit.deserializeV3(serialized, caseSensitive); - default: - throw new IOException( - String.format( - Locale.ROOT, - "Failed to deserialize IcebergSourceSplit. " - + "Encountered unsupported version: %d. Supported version are [1]", - version)); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java deleted file mode 100644 index d9061e049e00..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
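// A minimal SimpleVersionedSerializer example following the same pattern as the split serializer
// above: serialize() always writes the newest format while deserialize() keeps a branch for every
// version that may still sit in old checkpoints. VersionedStringSerializer and its version numbers
// are illustrative only.
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.flink.core.io.SimpleVersionedSerializer;

class VersionedStringSerializer implements SimpleVersionedSerializer<String> {
  private static final int VERSION = 2;

  @Override
  public int getVersion() {
    return VERSION;                                  // the version written from now on
  }

  @Override
  public byte[] serialize(String value) throws IOException {
    return value.getBytes(StandardCharsets.UTF_8);   // current (v2) encoding
  }

  @Override
  public String deserialize(int version, byte[] serialized) throws IOException {
    switch (version) {
      case 1:                                        // legacy format kept readable for old state
      case 2:
        return new String(serialized, StandardCharsets.UTF_8);
      default:
        throw new IOException("Unsupported version: " + version);
    }
  }
}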
- */ -package org.apache.iceberg.flink.source.split; - -public class IcebergSourceSplitState { - private final IcebergSourceSplit split; - private final IcebergSourceSplitStatus status; - - public IcebergSourceSplitState(IcebergSourceSplit split, IcebergSourceSplitStatus status) { - this.split = split; - this.status = status; - } - - public IcebergSourceSplit split() { - return split; - } - - public IcebergSourceSplitStatus status() { - return status; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java deleted file mode 100644 index d4a84a165e1a..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.split; - -public enum IcebergSourceSplitStatus { - UNASSIGNED, - ASSIGNED, - COMPLETED -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java deleted file mode 100644 index 319648ca275c..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.split; - -import java.io.Serializable; -import java.util.Comparator; - -public interface SerializableComparator extends Comparator, Serializable {} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java deleted file mode 100644 index 841969666ee5..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.split; - -import java.io.IOException; -import java.io.Serializable; -import java.io.UTFDataFormatException; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; - -/** - * Helper class to serialize and deserialize strings longer than 65K. The inspiration is mostly - * taken from the class org.apache.flink.core.memory.DataInputSerializer.readUTF and - * org.apache.flink.core.memory.DataOutputSerializer.writeUTF. - */ -class SerializerHelper implements Serializable { - - private SerializerHelper() {} - - /** - * Similar to {@link DataOutputSerializer#writeUTF(String)}. Except this supports larger payloads - * which is up to max integer value. - * - *

<p>Note: This method can be removed once an equivalent method is available in {@link - * DataOutputSerializer} itself; switch to that method when it is released in Flink 1.20. - * - *

    See * FLINK-34228 * https://github.com/apache/flink/pull/24191 - * - * @param out the output stream to write the string to. - * @param str the string value to be written. - */ - public static void writeLongUTF(DataOutputSerializer out, String str) throws IOException { - int strlen = str.length(); - long utflen = 0; - int ch; - - /* use charAt instead of copying String to char array */ - for (int i = 0; i < strlen; i++) { - ch = str.charAt(i); - utflen += getUTFBytesSize(ch); - - if (utflen > Integer.MAX_VALUE) { - throw new UTFDataFormatException("Encoded string reached maximum length: " + utflen); - } - } - - if (utflen > Integer.MAX_VALUE - 4) { - throw new UTFDataFormatException("Encoded string is too long: " + utflen); - } - - out.writeInt((int) utflen); - writeUTFBytes(out, str, (int) utflen); - } - - /** - * Similar to {@link DataInputDeserializer#readUTF()}. Except this supports larger payloads which - * is up to max integer value. - * - *

<p>Note: This method can be removed once an equivalent method is available in {@link - * DataOutputSerializer} itself; switch to that method when it is released in Flink 1.20. - * - *

    See * FLINK-34228 * https://github.com/apache/flink/pull/24191 - * - * @param in the input stream to read the string from. - * @return the string value read from the input stream. - * @throws IOException if an I/O error occurs when reading from the input stream. - */ - public static String readLongUTF(DataInputDeserializer in) throws IOException { - int utflen = in.readInt(); - byte[] bytearr = new byte[utflen]; - char[] chararr = new char[utflen]; - - int ch; - int char2; - int char3; - int count = 0; - int chararrCount = 0; - - in.readFully(bytearr, 0, utflen); - - while (count < utflen) { - ch = (int) bytearr[count] & 0xff; - if (ch > 127) { - break; - } - count++; - chararr[chararrCount++] = (char) ch; - } - - while (count < utflen) { - ch = (int) bytearr[count] & 0xff; - switch (ch >> 4) { - case 0: - case 1: - case 2: - case 3: - case 4: - case 5: - case 6: - case 7: - /* 0xxxxxxx */ - count++; - chararr[chararrCount++] = (char) ch; - break; - case 12: - case 13: - /* 110x xxxx 10xx xxxx */ - count += 2; - if (count > utflen) { - throw new UTFDataFormatException("malformed input: partial character at end"); - } - char2 = bytearr[count - 1]; - if ((char2 & 0xC0) != 0x80) { - throw new UTFDataFormatException("malformed input around byte " + count); - } - chararr[chararrCount++] = (char) (((ch & 0x1F) << 6) | (char2 & 0x3F)); - break; - case 14: - /* 1110 xxxx 10xx xxxx 10xx xxxx */ - count += 3; - if (count > utflen) { - throw new UTFDataFormatException("malformed input: partial character at end"); - } - char2 = bytearr[count - 2]; - char3 = bytearr[count - 1]; - if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) { - throw new UTFDataFormatException("malformed input around byte " + (count - 1)); - } - chararr[chararrCount++] = - (char) (((ch & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F)); - break; - default: - /* 10xx xxxx, 1111 xxxx */ - throw new UTFDataFormatException("malformed input around byte " + count); - } - } - // The number of chars produced may be less than utflen - return new String(chararr, 0, chararrCount); - } - - private static int getUTFBytesSize(int ch) { - if ((ch >= 0x0001) && (ch <= 0x007F)) { - return 1; - } else if (ch > 0x07FF) { - return 3; - } else { - return 2; - } - } - - private static void writeUTFBytes(DataOutputSerializer out, String str, int utflen) - throws IOException { - int strlen = str.length(); - int ch; - - int len = Math.max(1024, utflen); - - byte[] bytearr = new byte[len]; - int count = 0; - - int index; - for (index = 0; index < strlen; index++) { - ch = str.charAt(index); - if (!((ch >= 0x0001) && (ch <= 0x007F))) { - break; - } - bytearr[count++] = (byte) ch; - } - - for (; index < strlen; index++) { - ch = str.charAt(index); - if ((ch >= 0x0001) && (ch <= 0x007F)) { - bytearr[count++] = (byte) ch; - } else if (ch > 0x07FF) { - bytearr[count++] = (byte) (0xE0 | ((ch >> 12) & 0x0F)); - bytearr[count++] = (byte) (0x80 | ((ch >> 6) & 0x3F)); - bytearr[count++] = (byte) (0x80 | (ch & 0x3F)); - } else { - bytearr[count++] = (byte) (0xC0 | ((ch >> 6) & 0x1F)); - bytearr[count++] = (byte) (0x80 | (ch & 0x3F)); - } - } - - out.write(bytearr, 0, count); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java deleted file mode 100644 index 37bddfbb7182..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java +++ /dev/null @@ 
-1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.split; - -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.flink.source.reader.SplitWatermarkExtractor; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * Provides implementations of {@link org.apache.iceberg.flink.source.split.SerializableComparator} - * which could be used for ordering splits. These are used by the {@link - * org.apache.iceberg.flink.source.assigner.OrderedSplitAssignerFactory} and the {@link - * org.apache.iceberg.flink.source.reader.IcebergSourceReader} - */ -public class SplitComparators { - private SplitComparators() {} - - /** Comparator which orders the splits based on the file sequence number of the data files */ - public static SerializableComparator fileSequenceNumber() { - return (IcebergSourceSplit o1, IcebergSourceSplit o2) -> { - Preconditions.checkArgument( - o1.task().files().size() == 1 && o2.task().files().size() == 1, - "Could not compare combined task. Please use '%s' to prevent combining multiple files to a split", - FlinkReadOptions.SPLIT_FILE_OPEN_COST); - - Long seq1 = o1.task().files().iterator().next().file().fileSequenceNumber(); - Long seq2 = o2.task().files().iterator().next().file().fileSequenceNumber(); - - Preconditions.checkNotNull( - seq1, - "Invalid file sequence number: null. Doesn't support splits written with V1 format: %s", - o1); - Preconditions.checkNotNull( - seq2, - "Invalid file sequence number: null. Doesn't support splits written with V1 format: %s", - o2); - - int temp = Long.compare(seq1, seq2); - if (temp != 0) { - return temp; - } else { - return o1.splitId().compareTo(o2.splitId()); - } - }; - } - - /** Comparator which orders the splits based on watermark of the splits */ - public static SerializableComparator watermark( - SplitWatermarkExtractor watermarkExtractor) { - return (IcebergSourceSplit o1, IcebergSourceSplit o2) -> { - long watermark1 = watermarkExtractor.extractWatermark(o1); - long watermark2 = watermarkExtractor.extractWatermark(o2); - - int temp = Long.compare(watermark1, watermark2); - if (temp != 0) { - return temp; - } else { - return o1.splitId().compareTo(o2.splitId()); - } - }; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java deleted file mode 100644 index eabd757aa638..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.split; - -import java.util.Collection; -import java.util.Collections; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.SourceEvent; - -/** We can remove this class once FLINK-21364 is resolved. */ -@Internal -public class SplitRequestEvent implements SourceEvent { - private static final long serialVersionUID = 1L; - - private final Collection finishedSplitIds; - private final String requesterHostname; - - public SplitRequestEvent() { - this(Collections.emptyList()); - } - - public SplitRequestEvent(Collection finishedSplitIds) { - this(finishedSplitIds, null); - } - - public SplitRequestEvent(Collection finishedSplitIds, String requesterHostname) { - this.finishedSplitIds = finishedSplitIds; - this.requesterHostname = requesterHostname; - } - - public Collection finishedSplitIds() { - return finishedSplitIds; - } - - public String requesterHostname() { - return requesterHostname; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java deleted file mode 100644 index 6306e82d5729..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.util; - -import java.util.concurrent.TimeUnit; -import org.apache.flink.annotation.Internal; -import org.apache.flink.metrics.Gauge; - -/** - * This gauge measures the elapsed time between now and last recorded time set by {@link - * ElapsedTimeGauge#refreshLastRecordedTime()}. 
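[Editor's illustration, not part of the patch] ElapsedTimeGauge reports how long it has been since refreshLastRecordedTime() was last called, converted to the configured unit. A hedged sketch of how such a gauge is typically registered, written against the Flink 1.x RichFunction and metrics APIs; the operator, metric name, and refresh point are made up:

import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.metrics.Gauge;

// Hypothetical operator: refresh a recorded timestamp on activity, report the gap as a gauge.
public class ElapsedTimeMetricSketch extends RichMapFunction<String, String> {
  private transient AtomicLong lastRecordedNano;

  @Override
  public void open(Configuration parameters) {
    lastRecordedNano = new AtomicLong(System.nanoTime());
    // Report the time elapsed since the last record, in seconds.
    Gauge<Long> elapsed =
        () ->
            TimeUnit.SECONDS.convert(
                System.nanoTime() - lastRecordedNano.get(), TimeUnit.NANOSECONDS);
    getRuntimeContext().getMetricGroup().gauge("elapsedSecondsSinceLastRecord", elapsed);
  }

  @Override
  public String map(String value) {
    lastRecordedNano.set(System.nanoTime()); // same role as refreshLastRecordedTime()
    return value;
  }
}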
- */ -@Internal -public class ElapsedTimeGauge implements Gauge { - private final TimeUnit reportUnit; - private volatile long lastRecordedTimeNano; - - public ElapsedTimeGauge(TimeUnit timeUnit) { - this.reportUnit = timeUnit; - refreshLastRecordedTime(); - } - - public void refreshLastRecordedTime() { - this.lastRecordedTimeNano = System.nanoTime(); - } - - @Override - public Long getValue() { - return reportUnit.convert(System.nanoTime() - lastRecordedTimeNano, TimeUnit.NANOSECONDS); - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java deleted file mode 100644 index 2bbc9cf208fe..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.util; - -import java.util.List; -import java.util.Map; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.TableChange; -import org.apache.flink.table.catalog.UniqueConstraint; -import org.apache.iceberg.Table; -import org.apache.iceberg.Transaction; -import org.apache.iceberg.UpdateProperties; -import org.apache.iceberg.UpdateSchema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.types.Type; - -public class FlinkAlterTableUtil { - private FlinkAlterTableUtil() {} - - public static void commitChanges( - Table table, - String setLocation, - String setSnapshotId, - String pickSnapshotId, - Map setProperties) { - commitManageSnapshots(table, setSnapshotId, pickSnapshotId); - - Transaction transaction = table.newTransaction(); - - if (setLocation != null) { - transaction.updateLocation().setLocation(setLocation).commit(); - } - - if (!setProperties.isEmpty()) { - UpdateProperties updateProperties = transaction.updateProperties(); - setProperties.forEach( - (k, v) -> { - if (v == null) { - updateProperties.remove(k); - } else { - updateProperties.set(k, v); - } - }); - updateProperties.commit(); - } - - transaction.commitTransaction(); - } - - public static void commitChanges( - Table table, - String setLocation, - String setSnapshotId, - String pickSnapshotId, - List schemaChanges, - List propertyChanges) { - commitManageSnapshots(table, setSnapshotId, pickSnapshotId); - - Transaction transaction = table.newTransaction(); - - if (setLocation != null) { - transaction.updateLocation().setLocation(setLocation).commit(); - } - - if (!schemaChanges.isEmpty()) { - UpdateSchema updateSchema = transaction.updateSchema(); - 
FlinkAlterTableUtil.applySchemaChanges(updateSchema, schemaChanges); - updateSchema.commit(); - } - - if (!propertyChanges.isEmpty()) { - UpdateProperties updateProperties = transaction.updateProperties(); - FlinkAlterTableUtil.applyPropertyChanges(updateProperties, propertyChanges); - updateProperties.commit(); - } - - transaction.commitTransaction(); - } - - public static void commitManageSnapshots( - Table table, String setSnapshotId, String cherrypickSnapshotId) { - // don't allow setting the snapshot and picking a commit at the same time because order is - // ambiguous and choosing one order leads to different results - Preconditions.checkArgument( - setSnapshotId == null || cherrypickSnapshotId == null, - "Cannot set the current snapshot ID and cherry-pick snapshot changes"); - - if (setSnapshotId != null) { - long newSnapshotId = Long.parseLong(setSnapshotId); - table.manageSnapshots().setCurrentSnapshot(newSnapshotId).commit(); - } - - // if updating the table snapshot, perform that update first in case it fails - if (cherrypickSnapshotId != null) { - long newSnapshotId = Long.parseLong(cherrypickSnapshotId); - table.manageSnapshots().cherrypick(newSnapshotId).commit(); - } - } - - /** - * Applies a list of Flink table changes to an {@link UpdateSchema} operation. - * - * @param pendingUpdate an uncommitted UpdateSchema operation to configure - * @param schemaChanges a list of Flink table changes - */ - public static void applySchemaChanges( - UpdateSchema pendingUpdate, List schemaChanges) { - for (TableChange change : schemaChanges) { - if (change instanceof TableChange.AddColumn) { - TableChange.AddColumn addColumn = (TableChange.AddColumn) change; - Column flinkColumn = addColumn.getColumn(); - Preconditions.checkArgument( - FlinkCompatibilityUtil.isPhysicalColumn(flinkColumn), - "Unsupported table change: Adding computed column %s.", - flinkColumn.getName()); - Type icebergType = FlinkSchemaUtil.convert(flinkColumn.getDataType().getLogicalType()); - if (flinkColumn.getDataType().getLogicalType().isNullable()) { - pendingUpdate.addColumn( - flinkColumn.getName(), icebergType, flinkColumn.getComment().orElse(null)); - } else { - pendingUpdate.addRequiredColumn( - flinkColumn.getName(), icebergType, flinkColumn.getComment().orElse(null)); - } - } else if (change instanceof TableChange.ModifyColumn) { - TableChange.ModifyColumn modifyColumn = (TableChange.ModifyColumn) change; - applyModifyColumn(pendingUpdate, modifyColumn); - } else if (change instanceof TableChange.DropColumn) { - TableChange.DropColumn dropColumn = (TableChange.DropColumn) change; - pendingUpdate.deleteColumn(dropColumn.getColumnName()); - } else if (change instanceof TableChange.AddWatermark) { - throw new UnsupportedOperationException("Unsupported table change: AddWatermark."); - } else if (change instanceof TableChange.ModifyWatermark) { - throw new UnsupportedOperationException("Unsupported table change: ModifyWatermark."); - } else if (change instanceof TableChange.DropWatermark) { - throw new UnsupportedOperationException("Unsupported table change: DropWatermark."); - } else if (change instanceof TableChange.AddUniqueConstraint) { - TableChange.AddUniqueConstraint addPk = (TableChange.AddUniqueConstraint) change; - applyUniqueConstraint(pendingUpdate, addPk.getConstraint()); - } else if (change instanceof TableChange.ModifyUniqueConstraint) { - TableChange.ModifyUniqueConstraint modifyPk = (TableChange.ModifyUniqueConstraint) change; - applyUniqueConstraint(pendingUpdate, modifyPk.getNewConstraint()); - 
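[Editor's illustration, not part of the patch] applySchemaChanges translates Flink TableChange objects into calls on Iceberg's UpdateSchema, and commitChanges groups schema and property updates into one transaction so readers never observe a half-applied ALTER. A minimal sketch of the underlying Iceberg calls; the column name, comment, and property value are illustrative:

import java.util.Map;
import org.apache.iceberg.Table;
import org.apache.iceberg.Transaction;
import org.apache.iceberg.types.Types;

// Schema and property updates grouped into a single Iceberg transaction.
public class AlterTableSketch {
  static void addColumnAndProperties(Table table, Map<String, String> props) {
    Transaction txn = table.newTransaction();

    txn.updateSchema()
        .addColumn("event_ts", Types.TimestampType.withZone(), "event time") // optional by default
        .commit();

    txn.updateProperties().set("commit.retry.num-retries", "5").commit();

    // Nothing becomes visible to readers until the surrounding transaction commits.
    txn.commitTransaction();
  }
}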
} else if (change instanceof TableChange.DropConstraint) { - throw new UnsupportedOperationException("Unsupported table change: DropConstraint."); - } else { - throw new UnsupportedOperationException("Cannot apply unknown table change: " + change); - } - } - } - - /** - * Applies a list of Flink table property changes to an {@link UpdateProperties} operation. - * - * @param pendingUpdate an uncommitted UpdateProperty operation to configure - * @param propertyChanges a list of Flink table changes - */ - public static void applyPropertyChanges( - UpdateProperties pendingUpdate, List propertyChanges) { - for (TableChange change : propertyChanges) { - if (change instanceof TableChange.SetOption) { - TableChange.SetOption setOption = (TableChange.SetOption) change; - pendingUpdate.set(setOption.getKey(), setOption.getValue()); - } else if (change instanceof TableChange.ResetOption) { - TableChange.ResetOption resetOption = (TableChange.ResetOption) change; - pendingUpdate.remove(resetOption.getKey()); - } else { - throw new UnsupportedOperationException( - "The given table change is not a property change: " + change); - } - } - } - - private static void applyModifyColumn( - UpdateSchema pendingUpdate, TableChange.ModifyColumn modifyColumn) { - if (modifyColumn instanceof TableChange.ModifyColumnName) { - TableChange.ModifyColumnName modifyName = (TableChange.ModifyColumnName) modifyColumn; - pendingUpdate.renameColumn(modifyName.getOldColumnName(), modifyName.getNewColumnName()); - } else if (modifyColumn instanceof TableChange.ModifyColumnPosition) { - TableChange.ModifyColumnPosition modifyPosition = - (TableChange.ModifyColumnPosition) modifyColumn; - applyModifyColumnPosition(pendingUpdate, modifyPosition); - } else if (modifyColumn instanceof TableChange.ModifyPhysicalColumnType) { - TableChange.ModifyPhysicalColumnType modifyType = - (TableChange.ModifyPhysicalColumnType) modifyColumn; - Type type = FlinkSchemaUtil.convert(modifyType.getNewType().getLogicalType()); - String columnName = modifyType.getOldColumn().getName(); - pendingUpdate.updateColumn(columnName, type.asPrimitiveType()); - if (modifyType.getNewColumn().getDataType().getLogicalType().isNullable()) { - pendingUpdate.makeColumnOptional(columnName); - } else { - pendingUpdate.requireColumn(columnName); - } - } else if (modifyColumn instanceof TableChange.ModifyColumnComment) { - TableChange.ModifyColumnComment modifyComment = - (TableChange.ModifyColumnComment) modifyColumn; - pendingUpdate.updateColumnDoc( - modifyComment.getOldColumn().getName(), modifyComment.getNewComment()); - } else { - throw new UnsupportedOperationException( - "Cannot apply unknown modify-column change: " + modifyColumn); - } - } - - private static void applyModifyColumnPosition( - UpdateSchema pendingUpdate, TableChange.ModifyColumnPosition modifyColumnPosition) { - TableChange.ColumnPosition newPosition = modifyColumnPosition.getNewPosition(); - if (newPosition instanceof TableChange.First) { - pendingUpdate.moveFirst(modifyColumnPosition.getOldColumn().getName()); - } else if (newPosition instanceof TableChange.After) { - TableChange.After after = (TableChange.After) newPosition; - pendingUpdate.moveAfter(modifyColumnPosition.getOldColumn().getName(), after.column()); - } else { - throw new UnsupportedOperationException( - "Cannot apply unknown modify-column-position change: " + modifyColumnPosition); - } - } - - private static void applyUniqueConstraint( - UpdateSchema pendingUpdate, UniqueConstraint constraint) { - switch (constraint.getType()) { 
- case PRIMARY_KEY: - pendingUpdate.setIdentifierFields(constraint.getColumns()); - break; - case UNIQUE_KEY: - throw new UnsupportedOperationException( - "Unsupported table change: setting unique key constraints."); - default: - throw new UnsupportedOperationException( - "Cannot apply unknown unique constraint: " + constraint.getType().name()); - } - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java deleted file mode 100644 index 50fc83dba8e2..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.util; - -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.table.api.Schema; -import org.apache.flink.table.api.TableColumn; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; -import org.apache.flink.table.types.logical.RowType; - -/** - * This is a small util class that try to hide calls to Flink Internal or PublicEvolve interfaces as - * Flink can change those APIs during minor version release. - */ -public class FlinkCompatibilityUtil { - - private FlinkCompatibilityUtil() {} - - public static TypeInformation toTypeInfo(RowType rowType) { - return InternalTypeInfo.of(rowType); - } - - /** - * @deprecated since 1.10.0, will be removed in 2.0.0. - */ - @Deprecated - public static boolean isPhysicalColumn(TableColumn column) { - return column.isPhysical(); - } - - public static boolean isPhysicalColumn(Column column) { - return column.isPhysical(); - } - - public static boolean isPhysicalColumn(Schema.UnresolvedColumn column) { - return column instanceof Schema.UnresolvedPhysicalColumn; - } -} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java deleted file mode 100644 index 20b33e615e5f..000000000000 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.util; - -import java.util.concurrent.atomic.AtomicReference; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; - -public class FlinkPackage { - - private static final AtomicReference VERSION = new AtomicReference<>(); - public static final String FLINK_UNKNOWN_VERSION = "FLINK-UNKNOWN-VERSION"; - - private FlinkPackage() {} - - /** Returns Flink version string like x.y.z */ - public static String version() { - if (null == VERSION.get()) { - String detectedVersion; - try { - detectedVersion = versionFromJar(); - // use unknown version in case exact implementation version can't be found from the jar - // (this can happen if the DataStream class appears multiple times in the same classpath - // such as with shading) - detectedVersion = detectedVersion != null ? detectedVersion : FLINK_UNKNOWN_VERSION; - } catch (Exception e) { - detectedVersion = FLINK_UNKNOWN_VERSION; - } - VERSION.set(detectedVersion); - } - - return VERSION.get(); - } - - @VisibleForTesting - static String versionFromJar() { - // Choose {@link DataStream} class because it is one of the core Flink API - return DataStream.class.getPackage().getImplementationVersion(); - } - - @VisibleForTesting - static void setVersion(String version) { - VERSION.set(version); - } -} diff --git a/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory deleted file mode 100644 index 29a9955a7e20..000000000000 --- a/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -org.apache.iceberg.flink.FlinkDynamicTableFactory diff --git a/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory b/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory deleted file mode 100644 index 2b6bfa3cd579..000000000000 --- a/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -org.apache.iceberg.flink.FlinkCatalogFactory diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java deleted file mode 100644 index 4184526a6a1a..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
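[Editor's illustration, not part of the patch] The two service registration files deleted above declare FlinkDynamicTableFactory and FlinkCatalogFactory under META-INF/services, which is how Flink's table planner finds them on the classpath. A plain java.util.ServiceLoader sketch of that discovery step; Flink's own FactoryUtil layers identifier matching on top, so this is illustrative only:

import java.util.ServiceLoader;
import org.apache.flink.table.factories.Factory;

// Lists every Factory implementation registered through a META-INF/services entry.
public class FactoryDiscoverySketch {
  public static void main(String[] args) {
    for (Factory factory : ServiceLoader.load(Factory.class)) {
      System.out.println(factory.factoryIdentifier() + " -> " + factory.getClass().getName());
    }
  }
}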
- */ -package org.apache.iceberg.flink; - -import org.junit.jupiter.api.Test; - -public abstract class AvroGenericRecordConverterBase { - protected abstract void testConverter(DataGenerator dataGenerator) throws Exception; - - @Test - public void testPrimitiveTypes() throws Exception { - testConverter(new DataGenerators.Primitives()); - } - - @Test - public void testStructOfPrimitive() throws Exception { - testConverter(new DataGenerators.StructOfPrimitive()); - } - - @Test - public void testStructOfArray() throws Exception { - testConverter(new DataGenerators.StructOfArray()); - } - - @Test - public void testStructOfMap() throws Exception { - testConverter(new DataGenerators.StructOfMap()); - } - - @Test - public void testStructOfStruct() throws Exception { - testConverter(new DataGenerators.StructOfStruct()); - } - - @Test - public void testArrayOfPrimitive() throws Exception { - testConverter(new DataGenerators.ArrayOfPrimitive()); - } - - @Test - public void testArrayOfArray() throws Exception { - testConverter(new DataGenerators.ArrayOfArray()); - } - - @Test - public void testArrayOfMap() throws Exception { - testConverter(new DataGenerators.ArrayOfMap()); - } - - @Test - public void testArrayOfStruct() throws Exception { - testConverter(new DataGenerators.ArrayOfStruct()); - } - - @Test - public void testMapOfPrimitives() throws Exception { - testConverter(new DataGenerators.MapOfPrimitives()); - } - - @Test - public void testMapOfArray() throws Exception { - testConverter(new DataGenerators.MapOfArray()); - } - - @Test - public void testMapOfMap() throws Exception { - testConverter(new DataGenerators.MapOfMap()); - } - - @Test - public void testMapOfStruct() throws Exception { - testConverter(new DataGenerators.MapOfStruct()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java deleted file mode 100644 index 062ff68d5d85..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.io.File; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import org.apache.flink.util.ArrayUtils; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.SupportsNamespaces; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.relocated.com.google.common.base.Joiner; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public abstract class CatalogTestBase extends TestBase { - - protected static final String DATABASE = "db"; - @TempDir protected File hiveWarehouse; - @TempDir protected File hadoopWarehouse; - - @Parameter(index = 0) - protected String catalogName; - - @Parameter(index = 1) - protected Namespace baseNamespace; - - protected Catalog validationCatalog; - protected SupportsNamespaces validationNamespaceCatalog; - protected Map config = Maps.newHashMap(); - - protected String flinkDatabase; - protected Namespace icebergNamespace; - protected boolean isHadoopCatalog; - - @Parameters(name = "catalogName={0}, baseNamespace={1}") - protected static List parameters() { - return Arrays.asList( - new Object[] {"testhive", Namespace.empty()}, - new Object[] {"testhadoop", Namespace.empty()}, - new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1")}); - } - - @BeforeEach - public void before() { - this.isHadoopCatalog = catalogName.startsWith("testhadoop"); - this.validationCatalog = - isHadoopCatalog - ? new HadoopCatalog(hiveConf, "file:" + hadoopWarehouse.getPath()) - : catalog; - this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; - - config.put("type", "iceberg"); - if (!baseNamespace.isEmpty()) { - config.put(FlinkCatalogFactory.BASE_NAMESPACE, baseNamespace.toString()); - } - if (isHadoopCatalog) { - config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hadoop"); - } else { - config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); - config.put(CatalogProperties.URI, getURI(hiveConf)); - } - config.put(CatalogProperties.WAREHOUSE_LOCATION, String.format("file://%s", warehouseRoot())); - - this.flinkDatabase = catalogName + "." 
+ DATABASE; - this.icebergNamespace = - Namespace.of(ArrayUtils.concat(baseNamespace.levels(), new String[] {DATABASE})); - sql("CREATE CATALOG %s WITH %s", catalogName, toWithClause(config)); - } - - @AfterEach - public void clean() { - dropCatalog(catalogName, true); - } - - protected String warehouseRoot() { - if (isHadoopCatalog) { - return hadoopWarehouse.getAbsolutePath(); - } else { - return hiveWarehouse.getAbsolutePath(); - } - } - - protected String getFullQualifiedTableName(String tableName) { - final List levels = Lists.newArrayList(icebergNamespace.levels()); - levels.add(tableName); - return Joiner.on('.').join(levels); - } - - static String getURI(HiveConf conf) { - return conf.get(HiveConf.ConfVars.METASTOREURIS.varname); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java deleted file mode 100644 index b1e3b20ff7ac..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericRecord; - -/** - * This interface defines test data generator. Different implementations for primitive and complex - * nested fields are defined in {@link DataGenerators}. - */ -public interface DataGenerator { - Schema icebergSchema(); - - RowType flinkRowType(); - - org.apache.avro.Schema avroSchema(); - - GenericRecord generateIcebergGenericRecord(); - - GenericRowData generateFlinkRowData(); - - org.apache.avro.generic.GenericRecord generateAvroGenericRecord(); -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java deleted file mode 100644 index e2cd411d7069..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java +++ /dev/null @@ -1,1172 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.types.Types.NestedField.required; - -import com.fasterxml.jackson.databind.node.IntNode; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.util.Arrays; -import java.util.List; -import java.util.UUID; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; -import org.apache.avro.LogicalTypes; -import org.apache.avro.SchemaBuilder; -import org.apache.avro.generic.GenericData; -import org.apache.avro.util.Utf8; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.Days; - -/** - * Util class to generate test data with extensive coverage different field types: from primitives - * to complex nested types. 
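[Editor's illustration, not part of the patch] Every generator in this class pairs one Iceberg Schema with a Flink RowType derived from it and builds row values in the same field order. A stripped-down sketch of that pairing for a two-field schema; the field names and values are made up:

import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.FlinkSchemaUtil;
import org.apache.iceberg.types.Types;

// One Iceberg schema, the Flink RowType derived from it, and a row whose
// positions match the schema's field order.
public class TinyGenerator {
  public static void main(String[] args) {
    Schema icebergSchema =
        new Schema(
            Types.NestedField.required(1, "row_id", Types.StringType.get()),
            Types.NestedField.optional(2, "int_field", Types.IntegerType.get()));

    RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema);
    GenericRowData row = GenericRowData.of(StringData.fromString("row_id_value"), 42);

    System.out.println(flinkRowType);
    System.out.println(row.getArity()); // 2, matching the schema's two fields
  }
}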
- */ -public class DataGenerators { - - public static class Primitives implements DataGenerator { - private static final DateTime JODA_DATETIME_EPOC = - new DateTime(1970, 1, 1, 0, 0, 0, 0, DateTimeZone.UTC); - private static final DateTime JODA_DATETIME_20220110 = - new DateTime(2022, 1, 10, 0, 0, 0, 0, DateTimeZone.UTC); - private static final int DAYS_BTW_EPOC_AND_20220110 = - Days.daysBetween(JODA_DATETIME_EPOC, JODA_DATETIME_20220110).getDays(); - private static final int HOUR_8_IN_MILLI = (int) TimeUnit.HOURS.toMillis(8); - - private static final LocalDate JAVA_LOCAL_DATE_20220110 = LocalDate.of(2022, 1, 10); - private static final LocalTime JAVA_LOCAL_TIME_HOUR8 = LocalTime.of(8, 0); - private static final OffsetDateTime JAVA_OFFSET_DATE_TIME_20220110 = - OffsetDateTime.of(2022, 1, 10, 0, 0, 0, 0, ZoneOffset.UTC); - private static final LocalDateTime JAVA_LOCAL_DATE_TIME_20220110 = - LocalDateTime.of(2022, 1, 10, 0, 0, 0); - private static final BigDecimal BIG_DECIMAL_NEGATIVE = new BigDecimal("-1.50"); - private static final byte[] FIXED_BYTES = "012345689012345".getBytes(StandardCharsets.UTF_8); - - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - // primitive types - Types.NestedField.optional(2, "boolean_field", Types.BooleanType.get()), - Types.NestedField.optional(3, "int_field", Types.IntegerType.get()), - Types.NestedField.optional(4, "long_field", Types.LongType.get()), - Types.NestedField.optional(5, "float_field", Types.FloatType.get()), - Types.NestedField.optional(6, "double_field", Types.DoubleType.get()), - Types.NestedField.required(7, "string_field", Types.StringType.get()), - Types.NestedField.required(8, "date_field", Types.DateType.get()), - Types.NestedField.required(9, "time_field", Types.TimeType.get()), - Types.NestedField.required(10, "ts_with_zone_field", Types.TimestampType.withZone()), - Types.NestedField.required( - 11, "ts_without_zone_field", Types.TimestampType.withoutZone()), - Types.NestedField.required(12, "uuid_field", Types.UUIDType.get()), - Types.NestedField.required(13, "binary_field", Types.BinaryType.get()), - Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2)), - Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - /** - * Fix up Avro Schema that is converted from Iceberg Schema. - * - * @param schemaConvertedFromIceberg Avro Schema converted from Iceberg schema via {@link - * AvroSchemaUtil#convert(Schema, String)} - */ - private org.apache.avro.Schema fixupAvroSchemaConvertedFromIcebergSchema( - org.apache.avro.Schema schemaConvertedFromIceberg) { - List fixedFields = - schemaConvertedFromIceberg.getFields().stream() - .map( - field -> { - org.apache.avro.Schema.Field updatedField = field; - if (field.name().equals("time_field")) { - // Iceberg's AvroSchemaUtil uses timestamp-micros with Long value for time - // field, while AvroToRowDataConverters#convertToTime() always looks for - // Integer value assuming millis. The root problem is that - // AvroToRowDataConverters#createConverter() uses LogicalTypeRoot to - // determine converter and LogicalTypeRoot lost the timestamp precision - // carried by LogicalType like Time(6). 
- org.apache.avro.Schema fieldSchema = - LogicalTypes.timeMillis() - .addToSchema( - org.apache.avro.Schema.create(org.apache.avro.Schema.Type.INT)); - updatedField = new org.apache.avro.Schema.Field("time_field", fieldSchema); - } - - return new org.apache.avro.Schema.Field(updatedField, updatedField.schema()); - }) - .collect(Collectors.toList()); - return org.apache.avro.Schema.createRecord( - schemaConvertedFromIceberg.getName(), - schemaConvertedFromIceberg.getDoc(), - schemaConvertedFromIceberg.getNamespace(), - schemaConvertedFromIceberg.isError(), - fixedFields); - } - - private final org.apache.avro.Schema avroSchema = - fixupAvroSchemaConvertedFromIcebergSchema(AvroSchemaUtil.convert(icebergSchema, "table")); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("boolean_field", false); - genericRecord.setField("int_field", Integer.MAX_VALUE); - genericRecord.setField("long_field", Long.MAX_VALUE); - genericRecord.setField("float_field", Float.MAX_VALUE); - genericRecord.setField("double_field", Double.MAX_VALUE); - genericRecord.setField("string_field", "str"); - - genericRecord.setField("date_field", JAVA_LOCAL_DATE_20220110); - genericRecord.setField("time_field", JAVA_LOCAL_TIME_HOUR8); - genericRecord.setField("ts_with_zone_field", JAVA_OFFSET_DATE_TIME_20220110); - genericRecord.setField("ts_without_zone_field", JAVA_LOCAL_DATE_TIME_20220110); - - byte[] uuidBytes = new byte[16]; - for (int i = 0; i < 16; ++i) { - uuidBytes[i] = (byte) i; - } - - genericRecord.setField("uuid_field", UUID.nameUUIDFromBytes(uuidBytes)); - - byte[] binaryBytes = new byte[7]; - for (int i = 0; i < 7; ++i) { - binaryBytes[i] = (byte) i; - } - genericRecord.setField("binary_field", ByteBuffer.wrap(binaryBytes)); - - genericRecord.setField("decimal_field", BIG_DECIMAL_NEGATIVE); - genericRecord.setField("fixed_field", FIXED_BYTES); - - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - byte[] uuidBytes = new byte[16]; - for (int i = 0; i < 16; ++i) { - uuidBytes[i] = (byte) i; - } - - byte[] binaryBytes = new byte[7]; - for (int i = 0; i < 7; ++i) { - binaryBytes[i] = (byte) i; - } - - return GenericRowData.of( - StringData.fromString("row_id_value"), - false, - Integer.MAX_VALUE, - Long.MAX_VALUE, - Float.MAX_VALUE, - Double.MAX_VALUE, - StringData.fromString("str"), - DAYS_BTW_EPOC_AND_20220110, - HOUR_8_IN_MILLI, - // Although Avro logical type for timestamp fields are in micro seconds, - // AvroToRowDataConverters only looks for long value in milliseconds. 
- TimestampData.fromEpochMillis(JODA_DATETIME_20220110.getMillis()), - TimestampData.fromEpochMillis(JODA_DATETIME_20220110.getMillis()), - uuidBytes, - binaryBytes, - DecimalData.fromBigDecimal(BIG_DECIMAL_NEGATIVE, 9, 2), - FIXED_BYTES); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", new Utf8("row_id_value")); - genericRecord.put("boolean_field", false); - genericRecord.put("int_field", Integer.MAX_VALUE); - genericRecord.put("long_field", Long.MAX_VALUE); - genericRecord.put("float_field", Float.MAX_VALUE); - genericRecord.put("double_field", Double.MAX_VALUE); - genericRecord.put("string_field", new Utf8("str")); - - genericRecord.put("date_field", DAYS_BTW_EPOC_AND_20220110); - genericRecord.put("time_field", HOUR_8_IN_MILLI); - // Although Avro logical type for timestamp fields are in micro seconds, - // AvroToRowDataConverters only looks for long value in milliseconds. - genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis()); - genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis()); - - byte[] uuidBytes = new byte[16]; - for (int i = 0; i < 16; ++i) { - uuidBytes[i] = (byte) i; - } - genericRecord.put("uuid_field", ByteBuffer.wrap(uuidBytes)); - - byte[] binaryBytes = new byte[7]; - for (int i = 0; i < 7; ++i) { - binaryBytes[i] = (byte) i; - } - genericRecord.put("binary_field", ByteBuffer.wrap(binaryBytes)); - - BigDecimal bigDecimal = new BigDecimal("-1.50"); - // unscaledValue().toByteArray() is to match the behavior of RowDataToAvroConverters from - // Flink for decimal type - genericRecord.put("decimal_field", ByteBuffer.wrap(bigDecimal.unscaledValue().toByteArray())); - - genericRecord.put("fixed_field", ByteBuffer.wrap(FIXED_BYTES)); - - return genericRecord; - } - } - - public static class StructOfPrimitive implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "struct_of_primitive", - Types.StructType.of( - required(101, "id", Types.IntegerType.get()), - required(102, "name", Types.StringType.get())))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - Schema structSchema = - new Schema(icebergSchema.findField("struct_of_primitive").type().asStructType().fields()); - GenericRecord struct = GenericRecord.create(structSchema); - struct.setField("id", 1); - struct.setField("name", "Jane"); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("struct_of_primitive", struct); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - GenericRowData.of(1, StringData.fromString("Jane"))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.Schema 
structSchema = avroSchema.getField("struct_of_primitive").schema(); - org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); - struct.put("id", 1); - struct.put("name", "Jane"); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("struct_of_primitive", struct); - return genericRecord; - } - } - - public static class StructOfArray implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "struct_of_array", - Types.StructType.of( - required(101, "id", Types.IntegerType.get()), - required( - 102, "names", Types.ListType.ofRequired(201, Types.StringType.get()))))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - Schema structSchema = - new Schema(icebergSchema.findField("struct_of_array").type().asStructType().fields()); - GenericRecord struct = GenericRecord.create(structSchema); - struct.setField("id", 1); - struct.setField("names", Arrays.asList("Jane", "Joe")); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("struct_of_array", struct); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - StringData[] names = {StringData.fromString("Jane"), StringData.fromString("Joe")}; - return GenericRowData.of( - StringData.fromString("row_id_value"), GenericRowData.of(1, new GenericArrayData(names))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_array").schema(); - org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); - struct.put("id", 1); - struct.put("names", Arrays.asList("Jane", "Joe")); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("struct_of_array", struct); - return genericRecord; - } - } - - public static class StructOfMap implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "struct_of_map", - Types.StructType.of( - required(101, "id", Types.IntegerType.get()), - required( - 102, - "names", - Types.MapType.ofRequired( - 201, 202, Types.StringType.get(), Types.StringType.get()))))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - Schema 
structSchema = - new Schema(icebergSchema.findField("struct_of_map").type().asStructType().fields()); - GenericRecord struct = GenericRecord.create(structSchema); - struct.setField("id", 1); - struct.setField("names", ImmutableMap.of("Jane", "female", "Joe", "male")); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("struct_of_map", struct); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - GenericRowData.of( - 1, - new GenericMapData( - ImmutableMap.of( - StringData.fromString("Jane"), - StringData.fromString("female"), - StringData.fromString("Joe"), - StringData.fromString("male"))))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_map").schema(); - org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); - struct.put("id", 1); - struct.put("names", ImmutableMap.of("Jane", new Utf8("female"), "Joe", new Utf8("male"))); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("struct_of_map", struct); - return genericRecord; - } - } - - public static class StructOfStruct implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "struct_of_struct", - Types.StructType.of( - required(101, "id", Types.IntegerType.get()), - required( - 102, - "person_struct", - Types.StructType.of( - Types.NestedField.required(201, "name", Types.StringType.get()), - Types.NestedField.required(202, "address", Types.StringType.get())))))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - Schema structSchema = - new Schema(icebergSchema.findField("struct_of_struct").type().asStructType().fields()); - Schema personSchema = - new Schema(structSchema.findField("person_struct").type().asStructType().fields()); - GenericRecord person = GenericRecord.create(personSchema); - person.setField("name", "Jane"); - person.setField("address", "Apple Park"); - GenericRecord struct = GenericRecord.create(structSchema); - struct.setField("id", 1); - struct.setField("person_struct", person); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("struct_of_struct", struct); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - GenericRowData.of( - 1, - GenericRowData.of( - StringData.fromString("Jane"), StringData.fromString("Apple Park")))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_struct").schema(); - 
org.apache.avro.Schema personSchema = structSchema.getField("person_struct").schema(); - org.apache.avro.generic.GenericRecord person = new GenericData.Record(personSchema); - person.put("name", "Jane"); - person.put("address", "Apple Park"); - org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); - struct.put("id", 1); - struct.put("person_struct", person); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("struct_of_struct", struct); - return genericRecord; - } - } - - public static class ArrayOfPrimitive implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get()))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("array_of_int", Arrays.asList(1, 2, 3)); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - Integer[] arr = {1, 2, 3}; - return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(arr)); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("array_of_int", Arrays.asList(1, 2, 3)); - return genericRecord; - } - } - - public static class ArrayOfArray implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "array_of_array", - Types.ListType.ofRequired( - 101, Types.ListType.ofRequired(201, Types.IntegerType.get())))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField( - "array_of_array", Arrays.asList(Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6))); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - // non-primitive - Integer[] array1 = {1, 2, 3}; - Integer[] array2 = {4, 5, 6}; - GenericArrayData[] arrayOfArrays = { - new GenericArrayData(array1), new GenericArrayData(array2) - }; - return GenericRowData.of( - StringData.fromString("row_id_value"), new 
GenericArrayData(arrayOfArrays)); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put( - "array_of_array", Arrays.asList(Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6))); - return genericRecord; - } - } - - public static class ArrayOfMap implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "array_of_map", - Types.ListType.ofRequired( - 101, - Types.MapType.ofRequired( - 201, 202, Types.StringType.get(), Types.IntegerType.get())))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField( - "array_of_map", - Arrays.asList( - ImmutableMap.of("Jane", 1, "Joe", 2), ImmutableMap.of("Alice", 3, "Bob", 4))); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - GenericMapData[] array = { - new GenericMapData( - ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2)), - new GenericMapData( - ImmutableMap.of(StringData.fromString("Alice"), 3, StringData.fromString("Bob"), 4)) - }; - return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(array)); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put( - "array_of_map", - Arrays.asList( - ImmutableMap.of("Jane", 1, "Joe", 2), ImmutableMap.of("Alice", 3, "Bob", 4))); - return genericRecord; - } - } - - public static class ArrayOfStruct implements DataGenerator { - private final Types.StructType structType = - Types.StructType.of( - required(201, "id", Types.IntegerType.get()), - required(202, "name", Types.StringType.get())); - private final Schema structIcebergSchema = new Schema(structType.fields()); - private final org.apache.avro.Schema structAvroSchema = - AvroSchemaUtil.convert(structIcebergSchema, "struct"); - - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.optional( - 2, "array_of_struct", Types.ListType.ofRequired(101, structType))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord struct1 = 
GenericRecord.create(structIcebergSchema); - struct1.setField("id", 1); - struct1.setField("name", "Jane"); - GenericRecord struct2 = GenericRecord.create(structIcebergSchema); - struct2.setField("id", 2); - struct2.setField("name", "Joe"); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("array_of_struct", Arrays.asList(struct1, struct2)); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - GenericRowData[] structArray = { - GenericRowData.of(1, StringData.fromString("Jane")), - GenericRowData.of(2, StringData.fromString("Joe")) - }; - return GenericRowData.of( - StringData.fromString("row_id_value"), new GenericArrayData(structArray)); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord struct1 = new GenericData.Record(structAvroSchema); - struct1.put("id", 1); - struct1.put("name", "Jane"); - org.apache.avro.generic.GenericRecord struct2 = new GenericData.Record(structAvroSchema); - struct2.put("id", 2); - struct2.put("name", "Joe"); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("array_of_struct", Arrays.asList(struct1, struct2)); - return genericRecord; - } - } - - public static class MapOfPrimitives implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.optional( - 2, - "map_of_primitives", - Types.MapType.ofRequired( - 101, 102, Types.StringType.get(), Types.IntegerType.get()))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); - return genericRecord; - } - } - - public static class MapOfArray implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "map_of_array", - Types.MapType.ofRequired( - 101, - 102, - Types.StringType.get(), - Types.ListType.ofRequired(201, Types.IntegerType.get())))); - - private final RowType rowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema 
avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return rowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField( - "map_of_array", - ImmutableMap.of( - "Jane", Arrays.asList(1, 2, 3), - "Joe", Arrays.asList(4, 5, 6))); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - Integer[] janeArray = {1, 2, 3}; - Integer[] joeArray = {4, 5, 6}; - return GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of( - StringData.fromString("Jane"), - new GenericArrayData(janeArray), - StringData.fromString("Joe"), - new GenericArrayData(joeArray)))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put( - "map_of_array", - ImmutableMap.of( - "Jane", Arrays.asList(1, 2, 3), - "Joe", Arrays.asList(4, 5, 6))); - return genericRecord; - } - } - - public static class MapOfMap implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "map_of_map", - Types.MapType.ofRequired( - 101, - 102, - Types.StringType.get(), - Types.MapType.ofRequired( - 301, 302, Types.StringType.get(), Types.IntegerType.get())))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField( - "map_of_map", - ImmutableMap.of( - "female", ImmutableMap.of("Jane", 1, "Alice", 2), - "male", ImmutableMap.of("Joe", 3, "Bob", 4))); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of( - StringData.fromString("female"), - new GenericMapData( - ImmutableMap.of( - StringData.fromString("Jane"), 1, StringData.fromString("Alice"), 2)), - StringData.fromString("male"), - new GenericMapData( - ImmutableMap.of( - StringData.fromString("Joe"), 3, StringData.fromString("Bob"), 4))))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put( - "map_of_map", - ImmutableMap.of( - "female", ImmutableMap.of("Jane", 1, "Alice", 2), - "male", ImmutableMap.of("Joe", 3, "Bob", 4))); - return genericRecord; - } - } - - public static class MapOfStruct implements 
DataGenerator { - private org.apache.avro.Schema createAvroSchemaIdField() { - org.apache.avro.Schema schema = SchemaBuilder.builder().intType(); - // this is needed to match the converter generated schema props - schema.addProp("field-id", IntNode.valueOf(201)); - return schema; - } - - private org.apache.avro.Schema createAvroSchemaNameField() { - org.apache.avro.Schema schema = SchemaBuilder.builder().stringType(); - // this is needed to match the converter generated schema props - schema.addProp("field-id", IntNode.valueOf(202)); - return schema; - } - - private final Types.StructType structType = - Types.StructType.of( - required(201, "id", Types.IntegerType.get()), - required(202, "name", Types.StringType.get())); - private final Schema structIcebergSchema = new Schema(structType.fields()); - - private final org.apache.avro.Schema structAvroSchema = - SchemaBuilder.builder() - .record("struct") - .fields() - .name("id") - .type(createAvroSchemaIdField()) - .noDefault() - .name("name") - .type(createAvroSchemaNameField()) - .noDefault() - .endRecord(); - - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "map_of_struct", - Types.MapType.ofRequired(101, 102, Types.StringType.get(), structType))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - // Can't use AvroSchemaUtil.convert otherwise the nested schema will have generated name like - // `r102` not the specified name like `struct`. - org.apache.avro.Schema avroSchema = - SchemaBuilder.builder() - .record("table") - .fields() - .requiredString("row_id") - .name("map_of_struct") - .type(SchemaBuilder.builder().map().values(structAvroSchema)) - .noDefault() - .endRecord(); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord struct1 = GenericRecord.create(structIcebergSchema); - struct1.setField("id", 1); - struct1.setField("name", "Jane"); - GenericRecord struct2 = GenericRecord.create(structIcebergSchema); - struct2.setField("id", 2); - struct2.setField("name", "Joe"); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField( - "map_of_struct", ImmutableMap.of("struct1", struct1, "struct2", struct2)); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of( - StringData.fromString("struct1"), - GenericRowData.of(1, StringData.fromString("Jane")), - StringData.fromString("struct2"), - GenericRowData.of(2, StringData.fromString("Joe"))))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord struct1 = new GenericData.Record(structAvroSchema); - struct1.put("id", 1); - struct1.put("name", new Utf8("Jane")); - org.apache.avro.generic.GenericRecord struct2 = new GenericData.Record(structAvroSchema); - struct2.put("id", 2); - struct2.put("name", new Utf8("Joe")); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", new Utf8("row_id_value")); - 
genericRecord.put("map_of_struct", ImmutableMap.of("struct1", struct1, "struct2", struct2)); - return genericRecord; - } - } - - public static class MapOfStructStruct implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.optional( - 2, - "map", - Types.MapType.ofOptional( - 101, - 102, - Types.StructType.of( - Types.NestedField.required(201, "key", Types.LongType.get()), - Types.NestedField.optional(202, "keyData", Types.StringType.get())), - Types.StructType.of( - Types.NestedField.required(203, "value", Types.LongType.get()), - Types.NestedField.optional(204, "valueData", Types.StringType.get()))))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - throw new UnsupportedOperationException( - "Not applicable as Avro Map only support string key type"); - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - throw new UnsupportedOperationException("Not implemented yet"); - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of( - GenericRowData.of(1L, StringData.fromString("key_data")), - GenericRowData.of(1L, StringData.fromString("value_data"))))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - throw new UnsupportedOperationException("Avro Map only support string key type"); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java deleted file mode 100644 index fd5c6b76b683..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.UUID; -import org.apache.commons.io.FileUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.extension.AfterAllCallback; -import org.junit.jupiter.api.extension.AfterEachCallback; -import org.junit.jupiter.api.extension.BeforeAllCallback; -import org.junit.jupiter.api.extension.BeforeEachCallback; -import org.junit.jupiter.api.extension.ExtensionContext; - -public class HadoopCatalogExtension - implements BeforeAllCallback, BeforeEachCallback, AfterAllCallback, AfterEachCallback { - protected final String database; - protected final String tableName; - - protected Path temporaryFolder; - protected Catalog catalog; - protected CatalogLoader catalogLoader; - protected String warehouse; - protected TableLoader tableLoader; - - public HadoopCatalogExtension(String database, String tableName) { - this.database = database; - this.tableName = tableName; - } - - @Override - public void beforeAll(ExtensionContext context) throws Exception { - this.temporaryFolder = Files.createTempDirectory("junit5_hadoop_catalog-"); - } - - @Override - public void afterAll(ExtensionContext context) throws Exception { - FileUtils.deleteDirectory(temporaryFolder.toFile()); - } - - @Override - public void beforeEach(ExtensionContext context) throws Exception { - assertThat(temporaryFolder).exists().isDirectory(); - this.warehouse = "file:" + temporaryFolder + "/" + UUID.randomUUID(); - this.catalogLoader = - CatalogLoader.hadoop( - "hadoop", - new Configuration(), - ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); - this.catalog = catalogLoader.loadCatalog(); - this.tableLoader = - TableLoader.fromCatalog(catalogLoader, TableIdentifier.of(database, tableName)); - } - - @Override - public void afterEach(ExtensionContext context) throws Exception { - try { - catalog.dropTable(TableIdentifier.of(database, tableName)); - ((HadoopCatalog) catalog).close(); - tableLoader.close(); - } catch (Exception e) { - throw new RuntimeException("Failed to close catalog resource"); - } - } - - public TableLoader tableLoader() { - return tableLoader; - } - - public Catalog catalog() { - return catalog; - } - - public CatalogLoader catalogLoader() { - return catalogLoader; - } - - public String warehouse() { - return warehouse; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java deleted file mode 100644 index dc6ef400a4a9..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
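For orientation, a minimal sketch of how this extension is typically registered in a JUnit 5 test class; the class name and table name below are illustrative placeholders. Because the extension implements the before/after callbacks, registering it as a static field gives every test method a fresh warehouse, catalog, and table loader.

    // Illustrative placeholder test class; imports mirror the test classes in this module.
    class ExampleCatalogTest {
      @RegisterExtension
      static final HadoopCatalogExtension CATALOG_EXTENSION =
          new HadoopCatalogExtension("default", "example_table");

      @Test
      void startsWithEmptyCatalog() {
        // beforeEach() has already created the warehouse, catalog and table loader.
        assertThat(CATALOG_EXTENSION.warehouse()).startsWith("file:");
        assertThat(
                CATALOG_EXTENSION
                    .catalog()
                    .tableExists(TableIdentifier.of("default", "example_table")))
            .isFalse();
      }
    }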
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.junit.jupiter.api.extension.ExtensionContext; - -public class HadoopTableExtension extends HadoopCatalogExtension { - private final Schema schema; - private final PartitionSpec partitionSpec; - - private Table table; - - public HadoopTableExtension(String database, String tableName, Schema schema) { - this(database, tableName, schema, null); - } - - public HadoopTableExtension( - String database, String tableName, Schema schema, PartitionSpec partitionSpec) { - super(database, tableName); - this.schema = schema; - this.partitionSpec = partitionSpec; - } - - @Override - public void beforeEach(ExtensionContext context) throws Exception { - super.beforeEach(context); - if (partitionSpec == null) { - this.table = catalog.createTable(TableIdentifier.of(database, tableName), schema); - } else { - this.table = - catalog.createTable(TableIdentifier.of(database, tableName), schema, partitionSpec); - } - tableLoader.open(); - } - - public Table table() { - return table; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java deleted file mode 100644 index d2e086aa448e..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.runtime.testutils.InMemoryReporter; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.test.junit5.MiniClusterExtension; - -public class MiniFlinkClusterExtension { - - private static final int DEFAULT_TM_NUM = 1; - private static final int DEFAULT_PARALLELISM = 4; - - public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = - new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. 
- .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private MiniFlinkClusterExtension() {} - - /** - * It will start a mini cluster with classloader.check-leaked-classloader=false, so that we won't - * break the unit tests because of the class loader leak issue. In our iceberg integration tests, - * there're some that will assert the results after finished the flink jobs, so actually we may - * access the class loader that has been closed by the flink task managers if we enable the switch - * classloader.check-leaked-classloader by default. - */ - public static MiniClusterExtension createWithClassloaderCheckDisabled() { - return new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - } - - public static MiniClusterExtension createWithClassloaderCheckDisabled( - InMemoryReporter inMemoryReporter) { - Configuration configuration = new Configuration(DISABLE_CLASSLOADER_CHECK_CONFIG); - inMemoryReporter.addToConfiguration(configuration); - - return new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) - .setConfiguration(configuration) - .build()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java deleted file mode 100644 index a79406b75cf2..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.time.temporal.ChronoUnit; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.concurrent.TimeUnit; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -public class RowDataConverter { - private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); - private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); - - private RowDataConverter() {} - - public static RowData convert(Schema iSchema, Record record) { - return convert(iSchema.asStruct(), record); - } - - private static RowData convert(Types.StructType struct, Record record) { - GenericRowData rowData = new GenericRowData(struct.fields().size()); - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - - Type fieldType = field.type(); - rowData.setField(i, convert(fieldType, record.get(i))); - } - return rowData; - } - - private static Object convert(Type type, Object object) { - if (object == null) { - return null; - } - - switch (type.typeId()) { - case BOOLEAN: - case INTEGER: - case LONG: - case FLOAT: - case DOUBLE: - case FIXED: - return object; - case DATE: - return (int) ChronoUnit.DAYS.between(EPOCH_DAY, (LocalDate) object); - case TIME: - // Iceberg's time is in microseconds, while flink's time is in milliseconds. 
- LocalTime localTime = (LocalTime) object; - return (int) TimeUnit.NANOSECONDS.toMillis(localTime.toNanoOfDay()); - case TIMESTAMP: - return convertTimestamp(object, ((Types.TimestampType) type).shouldAdjustToUTC()); - case TIMESTAMP_NANO: - return convertTimestamp(object, ((Types.TimestampNanoType) type).shouldAdjustToUTC()); - case STRING: - return StringData.fromString((String) object); - case UUID: - UUID uuid = (UUID) object; - ByteBuffer bb = ByteBuffer.allocate(16); - bb.putLong(uuid.getMostSignificantBits()); - bb.putLong(uuid.getLeastSignificantBits()); - return bb.array(); - case BINARY: - ByteBuffer buffer = (ByteBuffer) object; - return Arrays.copyOfRange( - buffer.array(), - buffer.arrayOffset() + buffer.position(), - buffer.arrayOffset() + buffer.remaining()); - case DECIMAL: - Types.DecimalType decimalType = (Types.DecimalType) type; - return DecimalData.fromBigDecimal( - (BigDecimal) object, decimalType.precision(), decimalType.scale()); - case STRUCT: - return convert(type.asStructType(), (Record) object); - case LIST: - List list = (List) object; - Object[] convertedArray = new Object[list.size()]; - for (int i = 0; i < convertedArray.length; i++) { - convertedArray[i] = convert(type.asListType().elementType(), list.get(i)); - } - return new GenericArrayData(convertedArray); - case MAP: - Map convertedMap = Maps.newLinkedHashMap(); - Map map = (Map) object; - for (Map.Entry entry : map.entrySet()) { - convertedMap.put( - convert(type.asMapType().keyType(), entry.getKey()), - convert(type.asMapType().valueType(), entry.getValue())); - } - return new GenericMapData(convertedMap); - default: - throw new UnsupportedOperationException("Not a supported type: " + type); - } - } - - private static TimestampData convertTimestamp(Object timestamp, boolean shouldAdjustToUTC) { - if (shouldAdjustToUTC) { - return TimestampData.fromEpochMillis( - ((OffsetDateTime) timestamp).toInstant().toEpochMilli(), - ((OffsetDateTime) timestamp).getNano() % 1_000_000); - } else { - return TimestampData.fromEpochMillis( - ((LocalDateTime) timestamp).toInstant(ZoneOffset.UTC).toEpochMilli(), - ((LocalDateTime) timestamp).getNano() % 1_000_000); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java deleted file mode 100644 index ac15add88353..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
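A small sketch of the converter's intended use; the values are placeholders, and SCHEMA/createRecord come from the SimpleDataUtil helper shown below. The converter bridges Iceberg generic records and Flink's internal RowData so expected results can be written once and compared in either representation.

    // Illustrative only; SimpleDataUtil.SCHEMA is the (id, data) schema defined below.
    Record record = SimpleDataUtil.createRecord(1, "hello");
    RowData rowData = RowDataConverter.convert(SimpleDataUtil.SCHEMA, record);

    assertThat(rowData.getInt(0)).isEqualTo(1);
    assertThat(rowData.getString(1).toString()).isEqualTo("hello");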
- */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.time.Duration; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableScan; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.IcebergGenerics; -import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.deletes.EqualityDeleteWriter; -import org.apache.iceberg.deletes.PositionDelete; -import org.apache.iceberg.deletes.PositionDeleteWriter; -import org.apache.iceberg.encryption.EncryptedOutputFile; -import org.apache.iceberg.flink.sink.FlinkAppenderFactory; -import org.apache.iceberg.hadoop.HadoopInputFile; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; -import org.apache.iceberg.util.StructLikeSet; -import org.apache.iceberg.util.StructLikeWrapper; -import org.awaitility.Awaitility; - -public class SimpleDataUtil { - - private SimpleDataUtil() {} - - public static final Schema SCHEMA = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); - - public static final Schema SCHEMA2 = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "extra", Types.StringType.get())); - - public static final ResolvedSchema FLINK_SCHEMA = - ResolvedSchema.of( - Column.physical("id", DataTypes.INT()), Column.physical("data", DataTypes.STRING())); - - public static final TableSchema FLINK_TABLE_SCHEMA = TableSchema.fromResolvedSchema(FLINK_SCHEMA); - - public static final RowType ROW_TYPE = - (RowType) FLINK_SCHEMA.toSourceRowDataType().getLogicalType(); - - public static 
final Record RECORD = GenericRecord.create(SCHEMA); - public static final Record RECORD2 = GenericRecord.create(SCHEMA2); - - public static Table createTable( - String path, Map properties, boolean partitioned) { - PartitionSpec spec; - if (partitioned) { - spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); - } else { - spec = PartitionSpec.unpartitioned(); - } - return new HadoopTables().create(SCHEMA, spec, properties, path); - } - - public static Record createRecord(Integer id, String data) { - Record record = RECORD.copy(); - record.setField("id", id); - record.setField("data", data); - return record; - } - - public static Record createRecord(Integer id, String data, String extra) { - Record record = RECORD2.copy(); - record.setField("id", id); - record.setField("data", data); - record.setField("extra", extra); - return record; - } - - public static RowData createRowData(Integer id, String data) { - return GenericRowData.of(id, StringData.fromString(data)); - } - - public static RowData createInsert(Integer id, String data) { - return GenericRowData.ofKind(RowKind.INSERT, id, StringData.fromString(data)); - } - - public static RowData createDelete(Integer id, String data) { - return GenericRowData.ofKind(RowKind.DELETE, id, StringData.fromString(data)); - } - - public static RowData createUpdateBefore(Integer id, String data) { - return GenericRowData.ofKind(RowKind.UPDATE_BEFORE, id, StringData.fromString(data)); - } - - public static RowData createUpdateAfter(Integer id, String data) { - return GenericRowData.ofKind(RowKind.UPDATE_AFTER, id, StringData.fromString(data)); - } - - public static DataFile writeFile( - Table table, - Schema schema, - PartitionSpec spec, - Configuration conf, - String location, - String filename, - List rows) - throws IOException { - return writeFile(table, schema, spec, conf, location, filename, rows, null); - } - - /** Write the list of {@link RowData} to the given path and with the given partition data */ - public static DataFile writeFile( - Table table, - Schema schema, - PartitionSpec spec, - Configuration conf, - String location, - String filename, - List rows, - StructLike partition) - throws IOException { - Path path = new Path(location, filename); - FileFormat fileFormat = FileFormat.fromFileName(filename); - Preconditions.checkNotNull(fileFormat, "Cannot determine format for file: %s", filename); - - RowType flinkSchema = FlinkSchemaUtil.convert(schema); - FileAppenderFactory appenderFactory = - new FlinkAppenderFactory( - table, schema, flinkSchema, ImmutableMap.of(), spec, null, null, null); - - FileAppender appender = appenderFactory.newAppender(fromPath(path, conf), fileFormat); - try (FileAppender closeableAppender = appender) { - closeableAppender.addAll(rows); - } - - DataFiles.Builder builder = - DataFiles.builder(spec) - .withInputFile(HadoopInputFile.fromPath(path, conf)) - .withMetrics(appender.metrics()); - - if (partition != null) { - builder = builder.withPartition(partition); - } - - return builder.build(); - } - - public static DeleteFile writeEqDeleteFile( - Table table, - FileFormat format, - String filename, - FileAppenderFactory appenderFactory, - List deletes) - throws IOException { - EncryptedOutputFile outputFile = - table - .encryption() - .encrypt(fromPath(new Path(table.location(), filename), new Configuration())); - - EqualityDeleteWriter eqWriter = - appenderFactory.newEqDeleteWriter(outputFile, format, null); - try (EqualityDeleteWriter writer = eqWriter) { - writer.write(deletes); - } - return 
eqWriter.toDeleteFile(); - } - - public static DeleteFile writePosDeleteFile( - Table table, - FileFormat format, - String filename, - FileAppenderFactory appenderFactory, - List> positions) - throws IOException { - EncryptedOutputFile outputFile = - table - .encryption() - .encrypt(fromPath(new Path(table.location(), filename), new Configuration())); - - PositionDeleteWriter posWriter = - appenderFactory.newPosDeleteWriter(outputFile, format, null); - PositionDelete posDelete = PositionDelete.create(); - try (PositionDeleteWriter writer = posWriter) { - for (Pair p : positions) { - writer.write(posDelete.set(p.first(), p.second(), null)); - } - } - return posWriter.toDeleteFile(); - } - - private static List convertToRecords(List rows) { - List records = Lists.newArrayList(); - for (RowData row : rows) { - Integer id = row.isNullAt(0) ? null : row.getInt(0); - String data = row.isNullAt(1) ? null : row.getString(1).toString(); - if (row.getArity() == 2) { - records.add(createRecord(id, data)); - } else { - String extra = row.isNullAt(2) ? null : row.getString(2).toString(); - records.add(createRecord(id, data, extra)); - } - } - return records; - } - - public static void assertTableRows(String tablePath, List expected, String branch) - throws IOException { - assertTableRecords(tablePath, convertToRecords(expected), branch); - } - - public static void assertTableRows(Table table, List expected) throws IOException { - assertTableRecords(table, convertToRecords(expected), SnapshotRef.MAIN_BRANCH); - } - - public static void assertTableRows(Table table, List expected, String branch) - throws IOException { - assertTableRecords(table, convertToRecords(expected), branch); - } - - /** Get all rows for a table */ - public static List tableRecords(Table table) throws IOException { - table.refresh(); - List records = Lists.newArrayList(); - try (CloseableIterable iterable = IcebergGenerics.read(table).build()) { - for (Record record : iterable) { - records.add(record); - } - } - return records; - } - - public static boolean equalsRecords(List expected, List actual, Schema schema) { - if (expected.size() != actual.size()) { - return false; - } - Types.StructType type = schema.asStruct(); - StructLikeSet expectedSet = StructLikeSet.create(type); - expectedSet.addAll(expected); - StructLikeSet actualSet = StructLikeSet.create(type); - actualSet.addAll(actual); - return expectedSet.equals(actualSet); - } - - public static void assertRecordsEqual(List expected, List actual, Schema schema) { - assertThat(actual).hasSameSizeAs(expected); - Types.StructType type = schema.asStruct(); - StructLikeSet expectedSet = StructLikeSet.create(type); - expectedSet.addAll(expected); - StructLikeSet actualSet = StructLikeSet.create(type); - actualSet.addAll(actual); - assertThat(actualSet).containsExactlyInAnyOrderElementsOf(expectedSet); - } - - /** - * Assert table contains the expected list of records after waiting up to the configured {@code - * timeout} - */ - public static void assertTableRecords(Table table, List expected, Duration timeout) { - Awaitility.await("expected list of records should be produced") - .atMost(timeout) - .untilAsserted(() -> assertRecordsEqual(expected, tableRecords(table), table.schema())); - } - - public static void assertTableRecords(Table table, List expected) throws IOException { - assertTableRecords(table, expected, SnapshotRef.MAIN_BRANCH); - } - - public static void assertTableRecords(Table table, List expected, String branch) - throws IOException { - table.refresh(); - Snapshot 
snapshot = latestSnapshot(table, branch); - - if (snapshot == null) { - assertThat(expected) - .as( - "No snapshot for table '%s', assuming expected data is empty. If that's not the case, the Flink job most likely did not checkpoint.", - table.name()) - .isEmpty(); - return; - } - - Types.StructType type = table.schema().asStruct(); - StructLikeSet expectedSet = StructLikeSet.create(type); - expectedSet.addAll(expected); - - try (CloseableIterable iterable = - IcebergGenerics.read(table).useSnapshot(snapshot.snapshotId()).build()) { - StructLikeSet actualSet = StructLikeSet.create(type); - - for (Record record : iterable) { - actualSet.add(record); - } - - assertThat(actualSet).containsExactlyInAnyOrderElementsOf(expectedSet); - } - } - - // Returns the latest snapshot of the given branch in the table - public static Snapshot latestSnapshot(Table table, String branch) { - // For the main branch, currentSnapshot() is used to validate that the API behavior has - // not changed since that was the API used for validation prior to addition of branches. - if (branch.equals(SnapshotRef.MAIN_BRANCH)) { - return table.currentSnapshot(); - } - - return table.snapshot(branch); - } - - public static void assertTableRecords(String tablePath, List expected) - throws IOException { - Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); - assertTableRecords(new HadoopTables().load(tablePath), expected, SnapshotRef.MAIN_BRANCH); - } - - public static void assertTableRecords(String tablePath, List expected, String branch) - throws IOException { - Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); - assertTableRecords(new HadoopTables().load(tablePath), expected, branch); - } - - public static StructLikeSet expectedRowSet(Table table, Record... records) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - InternalRecordWrapper wrapper = new InternalRecordWrapper(table.schema().asStruct()); - for (Record record : records) { - set.add(wrapper.copyFor(record)); - } - return set; - } - - public static StructLikeSet actualRowSet(Table table, String... columns) throws IOException { - return actualRowSet(table, null, columns); - } - - public static StructLikeSet actualRowSet(Table table, Long snapshotId, String... columns) - throws IOException { - table.refresh(); - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - InternalRecordWrapper wrapper = new InternalRecordWrapper(table.schema().asStruct()); - try (CloseableIterable reader = - IcebergGenerics.read(table) - .useSnapshot(snapshotId == null ? 
table.currentSnapshot().snapshotId() : snapshotId) - .select(columns) - .build()) { - reader.forEach(record -> set.add(wrapper.copyFor(record))); - } - return set; - } - - public static List partitionDataFiles(Table table, Map partitionValues) - throws IOException { - table.refresh(); - Types.StructType partitionType = table.spec().partitionType(); - - Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expectedWrapper = - StructLikeWrapper.forType(partitionType).set(partitionRecord); - - List dataFiles = Lists.newArrayList(); - try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { - for (FileScanTask scanTask : fileScanTasks) { - StructLikeWrapper wrapper = - StructLikeWrapper.forType(partitionType).set(scanTask.file().partition()); - - if (expectedWrapper.equals(wrapper)) { - dataFiles.add(scanTask.file()); - } - } - } - - return dataFiles; - } - - public static Map> snapshotToDataFiles(Table table) throws IOException { - table.refresh(); - - Map> result = Maps.newHashMap(); - Snapshot current = table.currentSnapshot(); - while (current != null) { - TableScan tableScan = table.newScan(); - if (current.parentId() != null) { - // Collect the data files that was added only in current snapshot. - tableScan = tableScan.appendsBetween(current.parentId(), current.snapshotId()); - } else { - // Collect the data files that was added in the oldest snapshot. - tableScan = tableScan.useSnapshot(current.snapshotId()); - } - try (CloseableIterable scanTasks = tableScan.planFiles()) { - result.put( - current.snapshotId(), - ImmutableList.copyOf(Iterables.transform(scanTasks, FileScanTask::file))); - } - - // Continue to traverse the parent snapshot if exists. - if (current.parentId() == null) { - break; - } - // Iterate to the parent snapshot. - current = table.snapshot(current.parentId()); - } - return result; - } - - public static List matchingPartitions( - List dataFiles, PartitionSpec partitionSpec, Map partitionValues) { - Types.StructType partitionType = partitionSpec.partitionType(); - Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expected = StructLikeWrapper.forType(partitionType).set(partitionRecord); - return dataFiles.stream() - .filter( - df -> { - StructLikeWrapper wrapper = - StructLikeWrapper.forType(partitionType).set(df.partition()); - return wrapper.equals(expected); - }) - .collect(Collectors.toList()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java deleted file mode 100644 index 9411ea4f7d71..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
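A condensed sketch of the typical flow through these helpers, assuming it runs inside a test method declared to throw Exception; the warehouse path and file name are placeholders. The pattern is: create a test table, write a data file from RowData, commit it, then assert on the table contents.

    // Illustrative only; the location and filename are placeholders.
    Table table = SimpleDataUtil.createTable("file:/tmp/iceberg-test", ImmutableMap.of(), false);

    List<RowData> rows =
        Lists.newArrayList(
            SimpleDataUtil.createRowData(1, "hello"), SimpleDataUtil.createRowData(2, "world"));

    DataFile dataFile =
        SimpleDataUtil.writeFile(
            table,
            SimpleDataUtil.SCHEMA,
            table.spec(),
            new Configuration(),
            table.location(),
            "data-00000.parquet",
            rows);
    table.newAppend().appendFile(dataFile).commit();

    SimpleDataUtil.assertTableRecords(
        table,
        Lists.newArrayList(
            SimpleDataUtil.createRecord(1, "hello"), SimpleDataUtil.createRecord(2, "world")));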
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.flink.FlinkCatalogFactory.DEFAULT_CATALOG_NAME; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.Map; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -public abstract class SqlBase { - protected abstract TableEnvironment getTableEnv(); - - protected static TableResult exec(TableEnvironment env, String query, Object... args) { - return env.executeSql(String.format(query, args)); - } - - protected TableResult exec(String query, Object... args) { - return exec(getTableEnv(), query, args); - } - - protected List sql(String query, Object... args) { - TableResult tableResult = exec(query, args); - try (CloseableIterator iter = tableResult.collect()) { - return Lists.newArrayList(iter); - } catch (Exception e) { - throw new RuntimeException("Failed to collect table result", e); - } - } - - protected void assertSameElements(Iterable expected, Iterable actual) { - assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected); - } - - protected void assertSameElements(String message, Iterable expected, Iterable actual) { - assertThat(actual).isNotNull().as(message).containsExactlyInAnyOrderElementsOf(expected); - } - - /** - * We can not drop currently used catalog after FLINK-29677, so we have make sure that we do not - * use the current catalog before dropping it. This method switches to the 'default_catalog' and - * drops the one requested. - * - * @param catalogName The catalog to drop - * @param ifExists If we should use the 'IF EXISTS' when dropping the catalog - */ - protected void dropCatalog(String catalogName, boolean ifExists) { - sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); - sql("DROP CATALOG %s %s", ifExists ? "IF EXISTS" : "", catalogName); - } - - /** - * We can not drop currently used database after FLINK-33226, so we have make sure that we do not - * use the current database before dropping it. This method switches to the default database in - * the default catalog, and then it and drops the one requested. - * - * @param database The database to drop - * @param ifExists If we should use the 'IF EXISTS' when dropping the database - */ - protected void dropDatabase(String database, boolean ifExists) { - String currentCatalog = getTableEnv().getCurrentCatalog(); - sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); - sql("USE %s", getTableEnv().listDatabases()[0]); - sql("USE CATALOG %s", currentCatalog); - sql("DROP DATABASE %s %s", ifExists ? 
"IF EXISTS" : "", database); - } - - protected static String toWithClause(Map props) { - StringBuilder builder = new StringBuilder(); - builder.append("("); - int propCount = 0; - for (Map.Entry entry : props.entrySet()) { - if (propCount > 0) { - builder.append(","); - } - builder - .append("'") - .append(entry.getKey()) - .append("'") - .append("=") - .append("'") - .append(entry.getValue()) - .append("'"); - propCount++; - } - builder.append(")"); - return builder.toString(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java deleted file mode 100644 index 401960c3591b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.flink.FlinkCatalogFactory.DEFAULT_CATALOG_NAME; -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.file.Path; -import java.util.List; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.hive.TestHiveMetastore; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -public abstract class TestBase extends SqlBase { - - @RegisterExtension - public static MiniClusterExtension miniClusterExtension = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @TempDir protected Path temporaryDirectory; - - private static TestHiveMetastore metastore = null; - protected static HiveConf hiveConf = null; - protected static HiveCatalog catalog = null; - - private volatile TableEnvironment tEnv = null; - - @BeforeAll - public static void startMetastore() { - TestBase.metastore = new TestHiveMetastore(); - metastore.start(); - TestBase.hiveConf = metastore.hiveConf(); - TestBase.catalog = - (HiveCatalog) - CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); - } - - @AfterAll - public static void stopMetastore() throws Exception { - metastore.stop(); - TestBase.catalog = null; - } - 
- @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); - - TableEnvironment env = TableEnvironment.create(settings); - env.getConfig() - .getConfiguration() - .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - tEnv = env; - } - } - } - return tEnv; - } - - protected static TableResult exec(TableEnvironment env, String query, Object... args) { - return env.executeSql(String.format(query, args)); - } - - protected TableResult exec(String query, Object... args) { - return exec(getTableEnv(), query, args); - } - - protected List sql(String query, Object... args) { - TableResult tableResult = exec(query, args); - try (CloseableIterator iter = tableResult.collect()) { - return Lists.newArrayList(iter); - } catch (Exception e) { - throw new RuntimeException("Failed to collect table result", e); - } - } - - protected void assertSameElements(Iterable expected, Iterable actual) { - assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected); - } - - protected void assertSameElements(String message, Iterable expected, Iterable actual) { - assertThat(actual).isNotNull().as(message).containsExactlyInAnyOrderElementsOf(expected); - } - - /** - * We can not drop currently used catalog after FLINK-29677, so we have make sure that we do not - * use the current catalog before dropping it. This method switches to the 'default_catalog' and - * drops the one requested. - * - * @param catalogName The catalog to drop - * @param ifExists If we should use the 'IF EXISTS' when dropping the catalog - */ - protected void dropCatalog(String catalogName, boolean ifExists) { - sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); - sql("DROP CATALOG %s %s", ifExists ? "IF EXISTS" : "", catalogName); - } - - /** - * We can not drop currently used database after FLINK-33226, so we have make sure that we do not - * use the current database before dropping it. This method switches to the default database in - * the default catalog, and then it and drops the one requested. - * - * @param database The database to drop - * @param ifExists If we should use the 'IF EXISTS' when dropping the database - */ - protected void dropDatabase(String database, boolean ifExists) { - String currentCatalog = getTableEnv().getCurrentCatalog(); - sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); - sql("USE %s", getTableEnv().listDatabases()[0]); - sql("USE CATALOG %s", currentCatalog); - sql("DROP DATABASE %s %s", ifExists ? "IF EXISTS" : "", database); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java deleted file mode 100644 index e8f65921c19a..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.CatalogProperties.URI; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.entry; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.util.Map; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -/** Test for {@link CatalogLoader}. */ -public class TestCatalogLoader extends TestBase { - - private static File warehouse = null; - private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); - private static final Schema SCHEMA = - new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); - - @BeforeAll - public static void createWarehouse() throws IOException { - warehouse = File.createTempFile("warehouse", null); - assertThat(warehouse.delete()).isTrue(); - hiveConf.set("my_key", "my_value"); - } - - @AfterAll - public static void dropWarehouse() throws IOException { - if (warehouse != null && warehouse.exists()) { - Path warehousePath = new Path(warehouse.getAbsolutePath()); - FileSystem fs = warehousePath.getFileSystem(hiveConf); - assertThat(fs.delete(warehousePath, true)).as("Failed to delete " + warehousePath).isTrue(); - } - } - - @Test - public void testHadoopCatalogLoader() throws IOException, ClassNotFoundException { - Map properties = Maps.newHashMap(); - properties.put(CatalogProperties.WAREHOUSE_LOCATION, "file:" + warehouse); - CatalogLoader loader = CatalogLoader.hadoop("my_catalog", hiveConf, properties); - validateCatalogLoader(loader); - } - - @Test - public void testHiveCatalogLoader() throws IOException, ClassNotFoundException { - CatalogLoader loader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); - validateCatalogLoader(loader); - } - - @Test - public void testRESTCatalogLoader() { - Map properties = Maps.newHashMap(); - properties.put(URI, "http://localhost/"); - CatalogLoader.rest("my_catalog", hiveConf, Maps.newHashMap()); - } - - private static void validateCatalogLoader(CatalogLoader loader) - throws IOException, ClassNotFoundException { - Table table = javaSerAndDeSer(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); - validateHadoopConf(table); - } - - private static void validateHadoopConf(Table table) { - FileIO io = table.io(); - assertThat(io).as("FileIO should be a HadoopFileIO").isInstanceOf(HadoopFileIO.class); - HadoopFileIO hadoopIO = (HadoopFileIO) io; - 
assertThat(hadoopIO.conf()).contains(entry("my_key", "my_value")); - } - - @SuppressWarnings("unchecked") - private static T javaSerAndDeSer(T object) throws IOException, ClassNotFoundException { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(object); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - return (T) in.readObject(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java deleted file mode 100644 index f719c7bc0001..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -/** Test for {@link TableLoader}. 
*/ -public class TestCatalogTableLoader extends TestBase { - - private static File warehouse = null; - private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); - private static final Schema SCHEMA = - new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); - - @BeforeAll - public static void createWarehouse() throws IOException { - warehouse = File.createTempFile("warehouse", null); - assertThat(warehouse.delete()).isTrue(); - hiveConf.set("my_key", "my_value"); - } - - @AfterAll - public static void dropWarehouse() throws IOException { - if (warehouse != null && warehouse.exists()) { - Path warehousePath = new Path(warehouse.getAbsolutePath()); - FileSystem fs = warehousePath.getFileSystem(hiveConf); - assertThat(fs.delete(warehousePath, true)).as("Failed to delete " + warehousePath).isTrue(); - } - } - - @Test - public void testHadoopTableLoader() throws IOException, ClassNotFoundException { - String location = "file:" + warehouse + "/my_table"; - new HadoopTables(hiveConf).create(SCHEMA, location); - validateTableLoader(TableLoader.fromHadoopTable(location, hiveConf)); - } - - @Test - public void testHiveCatalogTableLoader() throws IOException, ClassNotFoundException { - CatalogLoader loader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); - javaSerdes(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); - - CatalogLoader catalogLoader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); - validateTableLoader(TableLoader.fromCatalog(catalogLoader, IDENTIFIER)); - } - - private static void validateTableLoader(TableLoader loader) - throws IOException, ClassNotFoundException { - TableLoader copied = javaSerdes(loader); - copied.open(); - try { - validateHadoopConf(copied.loadTable()); - } finally { - copied.close(); - } - } - - private static void validateHadoopConf(Table table) { - FileIO io = table.io(); - assertThat(io).as("FileIO should be a HadoopFileIO").isInstanceOf(HadoopFileIO.class); - HadoopFileIO hadoopIO = (HadoopFileIO) io; - assertThat(hadoopIO.conf().get("my_key")).isEqualTo("my_value"); - } - - @SuppressWarnings("unchecked") - private static T javaSerdes(T object) throws IOException, ClassNotFoundException { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(object); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - return (T) in.readObject(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java deleted file mode 100644 index 1997ef6998a2..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import org.apache.flink.types.Row; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.source.BoundedTableFactory; -import org.apache.iceberg.flink.source.ChangeLogTableTestBase; -import org.apache.iceberg.relocated.com.google.common.base.Joiner; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.StructLikeSet; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -/** - * In this test case, we mainly cover the impact of primary key selection, multiple operations - * within a single transaction, and multiple operations across different transactions on the - * correctness of the data. - */ -@ExtendWith(ParameterizedTestExtension.class) -public class TestChangeLogTable extends ChangeLogTableTestBase { - private static final Configuration CONF = new Configuration(); - private static final String SOURCE_TABLE = "default_catalog.default_database.source_change_logs"; - - private static final String CATALOG_NAME = "test_catalog"; - private static final String DATABASE_NAME = "test_db"; - private static final String TABLE_NAME = "test_table"; - private String warehouse; - - @Parameter private boolean partitioned; - - @Parameters(name = "PartitionedTable={0}") - public static Iterable<Object[]> parameters() { - return ImmutableList.of(new Object[] {true}, new Object[] {false}); - } - - @BeforeEach - public void before() throws IOException { - File warehouseFile = File.createTempFile("junit", null, temporaryDirectory.toFile()); - assertThat(warehouseFile.delete()).isTrue(); - warehouse = String.format("file:%s", warehouseFile); - - sql( - "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_NAME, warehouse); - sql("USE CATALOG %s", CATALOG_NAME); - sql("CREATE DATABASE %s", DATABASE_NAME); - sql("USE %s", DATABASE_NAME); - // Set table.exec.sink.upsert-materialize=NONE, so that downstream operators will receive the - // records in the same order as the source operator, bypassing Flink's inferred shuffle.
- getTableEnv().getConfig().set("table.exec.sink.upsert-materialize", "NONE"); - } - - @AfterEach - @Override - public void clean() { - sql("DROP TABLE IF EXISTS %s", TABLE_NAME); - dropDatabase(DATABASE_NAME, true); - dropCatalog(CATALOG_NAME, true); - BoundedTableFactory.clearDataSets(); - } - - @TestTemplate - public void testSqlChangeLogOnIdKey() throws Exception { - List> inputRowsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb")), - ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd")), - ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd"))); - - List> expectedRecordsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "ddd")), - ImmutableList.of(insertRow(1, "ddd"), insertRow(2, "ddd"))); - - testSqlChangeLog( - TABLE_NAME, ImmutableList.of("id"), inputRowsPerCheckpoint, expectedRecordsPerCheckpoint); - } - - @TestTemplate - public void testChangeLogOnDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, "aaa")), - ImmutableList.of( - updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), - ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "ccc"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc")), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(1, "ccc"), - insertRow(2, "aaa"), - insertRow(2, "ccc"))); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); - } - - @TestTemplate - public void testChangeLogOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, "aaa")), - ImmutableList.of( - updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), - ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "bbb")), - ImmutableList.of( - insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc"), insertRow(2, "bbb")), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(1, "ccc"), - insertRow(2, "aaa"), - insertRow(2, "bbb"))); - - testSqlChangeLog( - TABLE_NAME, ImmutableList.of("data", "id"), elementsPerCheckpoint, expectedRecords); - } - - @TestTemplate - public void testPureInsertOnIdKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(3, "ccc"), insertRow(4, "ddd")), - ImmutableList.of(insertRow(5, "eee"), insertRow(6, "fff"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), - ImmutableList.of( - insertRow(1, "aaa"), insertRow(2, "bbb"), insertRow(3, "ccc"), insertRow(4, "ddd")), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(3, 
"ccc"), - insertRow(4, "ddd"), - insertRow(5, "eee"), - insertRow(6, "fff"))); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); - } - - private static Record record(int id, String data) { - return SimpleDataUtil.createRecord(id, data); - } - - private Table createTable(String tableName, List key, boolean isPartitioned) { - String partitionByCause = isPartitioned ? "PARTITIONED BY (data)" : ""; - sql( - "CREATE TABLE %s(id INT, data VARCHAR, PRIMARY KEY(%s) NOT ENFORCED) %s", - tableName, Joiner.on(',').join(key), partitionByCause); - - // Upgrade the iceberg table to format v2. - CatalogLoader loader = - CatalogLoader.hadoop( - "my_catalog", CONF, ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); - Table table = loader.loadCatalog().loadTable(TableIdentifier.of(DATABASE_NAME, TABLE_NAME)); - TableOperations ops = ((BaseTable) table).operations(); - TableMetadata meta = ops.current(); - ops.commit(meta, meta.upgradeToFormatVersion(2)); - - return table; - } - - private void testSqlChangeLog( - String tableName, - List key, - List> inputRowsPerCheckpoint, - List> expectedRecordsPerCheckpoint) - throws Exception { - String dataId = BoundedTableFactory.registerDataSet(inputRowsPerCheckpoint); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - - assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)).isEqualTo(listJoin(inputRowsPerCheckpoint)); - - Table table = createTable(tableName, key, partitioned); - sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); - - table.refresh(); - List snapshots = findValidSnapshots(table); - int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - assertThat(snapshots) - .as("Should have the expected snapshot number") - .hasSameSizeAs(expectedRecordsPerCheckpoint); - - for (int i = 0; i < expectedSnapshotNum; i++) { - long snapshotId = snapshots.get(i).snapshotId(); - List expectedRows = expectedRecordsPerCheckpoint.get(i); - assertThat(actualRowSet(table, snapshotId)) - .as("Should have the expected records for the checkpoint#" + i) - .isEqualTo(expectedRowSet(table, expectedRows)); - } - - if (expectedSnapshotNum > 0) { - assertThat(sql("SELECT * FROM %s", tableName)) - .as("Should have the expected rows in the final table") - .containsExactlyInAnyOrderElementsOf( - expectedRecordsPerCheckpoint.get(expectedSnapshotNum - 1)); - } - } - - private List findValidSnapshots(Table table) { - List validSnapshots = Lists.newArrayList(); - for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream() - .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { - validSnapshots.add(snapshot); - } - } - return validSnapshots; - } - - private static StructLikeSet expectedRowSet(Table table, List rows) { - Record[] records = new Record[rows.size()]; - for (int i = 0; i < records.length; i++) { - records[i] = record((int) rows.get(i).getField(0), (String) rows.get(i).getField(1)); - } - return SimpleDataUtil.expectedRowSet(table, records); - } - - private static StructLikeSet actualRowSet(Table table, long snapshotId) throws IOException { - return SimpleDataUtil.actualRowSet(table, snapshotId, "*"); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java deleted file mode 100644 index 8992cbd75187..000000000000 --- 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Map; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileMetadata; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestDataFileSerialization { - - private static final Schema DATE_SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec PARTITION_SPEC = - PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); - - private static final Map COLUMN_SIZES = Maps.newHashMap(); - private static final Map VALUE_COUNTS = Maps.newHashMap(); - private static final Map NULL_VALUE_COUNTS = Maps.newHashMap(); - private static final Map NAN_VALUE_COUNTS = Maps.newHashMap(); - private static final Map LOWER_BOUNDS = Maps.newHashMap(); - private static final Map UPPER_BOUNDS = Maps.newHashMap(); - - static { - COLUMN_SIZES.put(1, 2L); - COLUMN_SIZES.put(2, 3L); - VALUE_COUNTS.put(1, 5L); - VALUE_COUNTS.put(2, 3L); - VALUE_COUNTS.put(4, 2L); - NULL_VALUE_COUNTS.put(1, 0L); - NULL_VALUE_COUNTS.put(2, 2L); - NAN_VALUE_COUNTS.put(4, 1L); - LOWER_BOUNDS.put(1, longToBuffer(0L)); - UPPER_BOUNDS.put(1, longToBuffer(4L)); - } - - private static final Metrics METRICS = - new Metrics( - 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); - - private static final DataFile DATA_FILE = - 
DataFiles.builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withSplitOffsets(ImmutableList.of(4L)) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - private static final DeleteFile POS_DELETE_FILE = - FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofPositionDeletes() - .withPath("/path/to/pos-delete.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .build(); - - private static final DeleteFile EQ_DELETE_FILE = - FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofEqualityDeletes(2, 3) - .withPath("/path/to/equality-delete.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .withSortOrder(SortOrder.unsorted()) - .build(); - - @Test - public void testJavaSerialization() throws Exception { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(DATA_FILE); - out.writeObject(DATA_FILE.copy()); - - out.writeObject(POS_DELETE_FILE); - out.writeObject(POS_DELETE_FILE.copy()); - - out.writeObject(EQ_DELETE_FILE); - out.writeObject(EQ_DELETE_FILE.copy()); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - for (int i = 0; i < 2; i += 1) { - Object obj = in.readObject(); - assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); - TestHelpers.assertEquals(DATA_FILE, (DataFile) obj); - } - - for (int i = 0; i < 2; i += 1) { - Object obj = in.readObject(); - assertThat(obj).as("Should be a position DeleteFile").isInstanceOf(DeleteFile.class); - TestHelpers.assertEquals(POS_DELETE_FILE, (DeleteFile) obj); - } - - for (int i = 0; i < 2; i += 1) { - Object obj = in.readObject(); - assertThat(obj).as("Should be a equality DeleteFile").isInstanceOf(DeleteFile.class); - TestHelpers.assertEquals(EQ_DELETE_FILE, (DeleteFile) obj); - } - } - } - - @Test - public void testDataFileKryoSerialization() throws IOException { - KryoSerializer kryo = new KryoSerializer<>(DataFile.class, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - - kryo.serialize(DATA_FILE, outputView); - kryo.serialize(DATA_FILE.copy(), outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - DataFile dataFile1 = kryo.deserialize(inputView); - DataFile dataFile2 = kryo.deserialize(inputView); - - TestHelpers.assertEquals(DATA_FILE, dataFile1); - TestHelpers.assertEquals(DATA_FILE, dataFile2); - } - - @Test - public void testDeleteFileKryoSerialization() throws IOException { - KryoSerializer kryo = new KryoSerializer<>(DeleteFile.class, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - - kryo.serialize(POS_DELETE_FILE, outputView); - kryo.serialize(POS_DELETE_FILE.copy(), outputView); - - kryo.serialize(EQ_DELETE_FILE, outputView); - kryo.serialize(EQ_DELETE_FILE.copy(), outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - - DeleteFile posDeleteFile1 = kryo.deserialize(inputView); - DeleteFile posDeleteFile2 = 
kryo.deserialize(inputView); - - TestHelpers.assertEquals(POS_DELETE_FILE, posDeleteFile1); - TestHelpers.assertEquals(POS_DELETE_FILE, posDeleteFile2); - - DeleteFile eqDeleteFile1 = kryo.deserialize(inputView); - DeleteFile eqDeleteFile2 = kryo.deserialize(inputView); - - TestHelpers.assertEquals(EQ_DELETE_FILE, eqDeleteFile1); - TestHelpers.assertEquals(EQ_DELETE_FILE, eqDeleteFile2); - } - - private static ByteBuffer longToBuffer(long value) { - return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java deleted file mode 100644 index b9a7d5b1d589..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.types.Types.NestedField.required; - -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.types.Types; - -public class TestFixtures { - - private TestFixtures() {} - - public static final Schema SCHEMA = - new Schema( - required(1, "data", Types.StringType.get()), - required(2, "id", Types.LongType.get()), - required(3, "dt", Types.StringType.get())); - - public static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("dt").bucket("id", 1).build(); - - public static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); - - public static final String DATABASE = "default"; - public static final String TABLE = "t"; - public static final String SINK_TABLE = "t_sink"; - - public static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of(DATABASE, TABLE); - public static final TableIdentifier SINK_TABLE_IDENTIFIER = - TableIdentifier.of(DATABASE, SINK_TABLE); - - public static final Schema TS_SCHEMA = - new Schema( - required(1, "ts", Types.TimestampType.withoutZone()), - required(2, "str", Types.StringType.get())); - - public static final PartitionSpec TS_SPEC = - PartitionSpec.builderFor(TS_SCHEMA).hour("ts").build(); - - public static final RowType TS_ROW_TYPE = FlinkSchemaUtil.convert(TS_SCHEMA); -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java deleted file mode 100644 index 70c8043f8fbb..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * 
Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.nio.file.Files; -import java.util.concurrent.TimeUnit; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Schema; -import org.apache.flink.table.api.Table; -import org.apache.flink.table.api.TableDescriptor; -import org.apache.flink.table.api.TableEnvironment; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.Test; - -public class TestFlinkAnonymousTable extends TestBase { - - @Test - public void testWriteAnonymousTable() throws Exception { - File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); - TableEnvironment tEnv = getTableEnv(); - Table table = - tEnv.from( - TableDescriptor.forConnector("datagen") - .schema(Schema.newBuilder().column("f0", DataTypes.STRING()).build()) - .option("number-of-rows", "3") - .build()); - - TableDescriptor descriptor = - TableDescriptor.forConnector("iceberg") - .schema(Schema.newBuilder().column("f0", DataTypes.STRING()).build()) - .option("catalog-name", "hadoop_test") - .option("catalog-type", "hadoop") - .option("catalog-database", "test_db") - .option("catalog-table", "test") - .option("warehouse", warehouseDir.getAbsolutePath()) - .build(); - - table.insertInto(descriptor).execute(); - Awaitility.await() - .atMost(3, TimeUnit.SECONDS) - .untilAsserted( - () -> - assertThat(warehouseDir.toPath().resolve("test_db").resolve("test").toFile()) - .exists()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java deleted file mode 100644 index bd07087756ad..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.nio.file.Path; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; -import org.apache.flink.types.Row; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkCatalogDatabase extends CatalogTestBase { - - @AfterEach - @Override - public void clean() { - sql("DROP TABLE IF EXISTS %s.tl", flinkDatabase); - dropDatabase(flinkDatabase, true); - super.clean(); - } - - @TestTemplate - public void testCreateNamespace() { - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Database should not already exist") - .isFalse(); - - sql("CREATE DATABASE %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Database should exist") - .isTrue(); - - sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Database should still exist") - .isTrue(); - - dropDatabase(flinkDatabase, true); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Database should be dropped") - .isFalse(); - - sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Database should be created") - .isTrue(); - } - - @TestTemplate - public void testDropEmptyDatabase() { - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - sql("CREATE DATABASE %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - dropDatabase(flinkDatabase, true); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should have been dropped") - .isFalse(); - } - - @TestTemplate - public void testDropNonEmptyNamespace() { - assumeThat(isHadoopCatalog) - .as("Hadoop catalog throws IOException: Directory is not empty.") - .isFalse(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - sql("CREATE DATABASE %s", flinkDatabase); - validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), - new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - assertThat(validationCatalog.tableExists(TableIdentifier.of(icebergNamespace, "tl"))) - .as("Table should exist") - .isTrue(); - assertThatThrownBy(() -> dropDatabase(flinkDatabase, true)) - .cause() - .isInstanceOf(DatabaseNotEmptyException.class) - .hasMessage( - String.format("Database %s in catalog %s is not empty.", DATABASE, catalogName)); - sql("DROP TABLE %s.tl", flinkDatabase); - } - - @TestTemplate - public void testListTables() { - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); 
- sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - assertThat(sql("SHOW TABLES")).isEmpty(); - validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), - new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); - - List<Row> tables = sql("SHOW TABLES"); - assertThat(tables).hasSize(1); - assertThat("tl").as("Table name should match").isEqualTo(tables.get(0).getField(0)); - } - - @TestTemplate - public void testListNamespace() { - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - - List<Row> databases = sql("SHOW DATABASES"); - - if (isHadoopCatalog) { - assertThat(databases).hasSize(1); - assertThat(databases.get(0).getField(0)).as("Should have db database").isEqualTo("db"); - if (!baseNamespace.isEmpty()) { - // test a namespace that does not belong to this catalog - validationNamespaceCatalog.createNamespace( - Namespace.of(baseNamespace.level(0), "UNKNOWN_NAMESPACE")); - databases = sql("SHOW DATABASES"); - assertThat(databases).hasSize(1); - assertThat(databases.get(0).getField(0)).as("Should have db database").isEqualTo("db"); - } - } else { - // If multiple classes extend FlinkTestBase, TestHiveMetastore may lose the creation of the - // default database. See HiveMetaStore.HMSHandler.init. - assertThat(databases) - .as("Should have db database") - .anyMatch(d -> Objects.equals(d.getField(0), "db")); - } - } - - @TestTemplate - public void testCreateNamespaceWithMetadata() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - Map<String, String> nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - assertThat(nsMetadata).containsEntry("prop", "value"); - } - - @TestTemplate - public void testCreateNamespaceWithComment() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - - sql("CREATE DATABASE %s COMMENT 'namespace doc'", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - Map<String, String> nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - assertThat(nsMetadata).containsEntry("comment", "namespace doc"); - } - - @TestTemplate - public void testCreateNamespaceWithLocation() throws Exception { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - - Path location = temporaryDirectory.getRoot(); - sql("CREATE DATABASE %s WITH ('location'='%s')", flinkDatabase, location); -
assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - Map nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - assertThat(nsMetadata).containsEntry("location", "file:" + location.getRoot()); - } - - @TestTemplate - public void testSetProperties() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - - sql("CREATE DATABASE %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - - Map defaultMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - assertThat(defaultMetadata).doesNotContainKey("prop"); - sql("ALTER DATABASE %s SET ('prop'='value')", flinkDatabase); - Map nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - assertThat(nsMetadata).containsEntry("prop", "value"); - } - - @TestTemplate - public void testHadoopNotSupportMeta() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isTrue(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - assertThatThrownBy(() -> sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase)) - .cause() - .isInstanceOf(UnsupportedOperationException.class) - .hasMessage( - String.format( - "Cannot create namespace %s: metadata is not supported", icebergNamespace)); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java deleted file mode 100644 index 4c9e95b8fa82..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -public class TestFlinkCatalogFactory { - - private Map props; - - @BeforeEach - public void before() { - props = Maps.newHashMap(); - props.put("type", "iceberg"); - props.put(CatalogProperties.WAREHOUSE_LOCATION, "/tmp/location"); - } - - @Test - public void testCreateCatalogHive() { - String catalogName = "hiveCatalog"; - props.put( - FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); - - Catalog catalog = - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); - - assertThat(catalog).isNotNull().isInstanceOf(HiveCatalog.class); - } - - @Test - public void testCreateCatalogHadoop() { - String catalogName = "hadoopCatalog"; - props.put( - FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP); - - Catalog catalog = - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); - - assertThat(catalog).isNotNull().isInstanceOf(HadoopCatalog.class); - } - - @Test - public void testCreateCatalogCustom() { - String catalogName = "customCatalog"; - props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - - Catalog catalog = - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); - - assertThat(catalog).isNotNull().isInstanceOf(CustomHadoopCatalog.class); - } - - @Test - public void testCreateCatalogCustomWithHiveCatalogTypeSet() { - String catalogName = "customCatalog"; - props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - props.put( - FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); - - assertThatThrownBy( - () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith( - "Cannot create catalog customCatalog, both catalog-type and catalog-impl are set"); - } - - @Test - public void testLoadCatalogUnknown() { - String catalogName = "unknownCatalog"; - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "fooType"); - - assertThatThrownBy( - () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())) - .isInstanceOf(UnsupportedOperationException.class) - .hasMessageStartingWith("Unknown catalog-type: fooType"); - } - - public static class CustomHadoopCatalog extends HadoopCatalog { - - public CustomHadoopCatalog() {} - - public CustomHadoopCatalog(Configuration conf, String warehouseLocation) { - setConf(conf); - initialize( - "custom", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java deleted file mode 100644 index 
2cfee87e4631..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java +++ /dev/null @@ -1,715 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.util.Arrays; -import java.util.Collections; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.StreamSupport; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Schema.UnresolvedPrimaryKey; -import org.apache.flink.table.api.TableException; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DataOperations; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.NoSuchTableException; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkCatalogTable extends CatalogTestBase { - - @Override - @BeforeEach - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - } - - @AfterEach - public void cleanNamespaces() { - sql("DROP TABLE IF EXISTS %s.tl", flinkDatabase); - sql("DROP TABLE IF EXISTS %s.tl2", flinkDatabase); - dropDatabase(flinkDatabase, true); - super.clean(); - } - - @TestTemplate - public void testGetTable() { - sql("CREATE TABLE tl(id BIGINT, strV STRING)"); - - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, "tl")); - Schema iSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "strV", Types.StringType.get())); - 
assertThat(table.schema().toString()) - .as("Should load the expected iceberg schema") - .isEqualTo(iSchema.toString()); - } - - @TestTemplate - public void testRenameTable() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support rename table").isFalse(); - final Schema tableSchema = - new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); - validationCatalog.createTable(TableIdentifier.of(icebergNamespace, "tl"), tableSchema); - sql("ALTER TABLE tl RENAME TO tl2"); - - assertThatThrownBy(() -> getTableEnv().from("tl")) - .isInstanceOf(ValidationException.class) - .hasMessage("Table `tl` was not found."); - - Schema actualSchema = FlinkSchemaUtil.convert(getTableEnv().from("tl2").getResolvedSchema()); - assertThat(tableSchema.asStruct()).isEqualTo(actualSchema.asStruct()); - } - - @TestTemplate - public void testCreateTable() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT)"); - - Table table = table("tl"); - assertThat(table.schema().asStruct()) - .isEqualTo( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); - CatalogTable catalogTable = catalogTable("tl"); - assertThat(catalogTable.getUnresolvedSchema()) - .isEqualTo( - org.apache.flink.table.api.Schema.newBuilder() - .column("id", DataTypes.BIGINT()) - .build()); - } - - @TestTemplate - public void testCreateTableWithPrimaryKey() throws Exception { - sql("CREATE TABLE tl(id BIGINT, data STRING, key STRING PRIMARY KEY NOT ENFORCED)"); - - Table table = table("tl"); - assertThat(table.schema().identifierFieldIds()) - .as("Should have the expected row key.") - .isEqualTo(Sets.newHashSet(table.schema().findField("key").fieldId())); - CatalogTable catalogTable = catalogTable("tl"); - Optional uniqueConstraintOptional = - catalogTable.getUnresolvedSchema().getPrimaryKey(); - assertThat(uniqueConstraintOptional).isPresent(); - assertThat(uniqueConstraintOptional.get().getColumnNames()).containsExactly("key"); - } - - @TestTemplate - public void testCreateTableWithMultiColumnsInPrimaryKey() throws Exception { - sql( - "CREATE TABLE tl(id BIGINT, data STRING, CONSTRAINT pk_constraint PRIMARY KEY(data, id) NOT ENFORCED)"); - - Table table = table("tl"); - assertThat(table.schema().identifierFieldIds()) - .as("Should have the expected RowKey") - .isEqualTo( - Sets.newHashSet( - table.schema().findField("id").fieldId(), - table.schema().findField("data").fieldId())); - CatalogTable catalogTable = catalogTable("tl"); - Optional uniqueConstraintOptional = - catalogTable.getUnresolvedSchema().getPrimaryKey(); - assertThat(uniqueConstraintOptional).isPresent(); - assertThat(uniqueConstraintOptional.get().getColumnNames()).containsExactly("id", "data"); - } - - @TestTemplate - public void testCreateTableIfNotExists() { - sql("CREATE TABLE tl(id BIGINT)"); - - // Assert that table does exist. 
- assertThat(table("tl")).isNotNull(); - - sql("DROP TABLE tl"); - assertThatThrownBy(() -> table("tl")) - .isInstanceOf(NoSuchTableException.class) - .hasMessage("Table does not exist: " + getFullQualifiedTableName("tl")); - - sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); - assertThat(table("tl").properties()).doesNotContainKey("key"); - - table("tl").updateProperties().set("key", "value").commit(); - assertThat(table("tl").properties()).containsEntry("key", "value"); - - sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); - assertThat(table("tl").properties()).containsEntry("key", "value"); - } - - @TestTemplate - public void testCreateTableLike() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT)"); - sql("CREATE TABLE tl2 LIKE tl"); - - Table table = table("tl2"); - assertThat(table.schema().asStruct()) - .isEqualTo( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); - CatalogTable catalogTable = catalogTable("tl2"); - assertThat(catalogTable.getUnresolvedSchema()) - .isEqualTo( - org.apache.flink.table.api.Schema.newBuilder() - .column("id", DataTypes.BIGINT()) - .build()); - } - - @TestTemplate - public void testCreateTableLikeInDiffIcebergCatalog() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT)"); - - String catalog2 = catalogName + "2"; - sql("CREATE CATALOG %s WITH %s", catalog2, toWithClause(config)); - sql("CREATE DATABASE %s", catalog2 + ".testdb"); - sql("CREATE TABLE %s LIKE tl", catalog2 + ".testdb.tl2"); - - CatalogTable catalogTable = catalogTable(catalog2, "testdb", "tl2"); - assertThat(catalogTable.getUnresolvedSchema()) - .isEqualTo( - org.apache.flink.table.api.Schema.newBuilder() - .column("id", DataTypes.BIGINT()) - .build()); - - dropCatalog(catalog2, true); - } - - @TestTemplate - public void testCreateTableLikeInFlinkCatalog() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT)"); - - sql("CREATE TABLE `default_catalog`.`default_database`.tl2 LIKE tl"); - - CatalogTable catalogTable = catalogTable("default_catalog", "default_database", "tl2"); - assertThat(catalogTable.getUnresolvedSchema()) - .isEqualTo( - org.apache.flink.table.api.Schema.newBuilder() - .column("id", DataTypes.BIGINT()) - .build()); - - String srcCatalogProps = FlinkCreateTableOptions.toJson(catalogName, DATABASE, "tl", config); - Map options = catalogTable.getOptions(); - assertThat(options) - .containsEntry( - FlinkCreateTableOptions.CONNECTOR_PROPS_KEY, - FlinkDynamicTableFactory.FACTORY_IDENTIFIER) - .containsEntry(FlinkCreateTableOptions.SRC_CATALOG_PROPS_KEY, srcCatalogProps); - } - - @TestTemplate - public void testCreateTableLocation() { - assumeThat(isHadoopCatalog) - .as("HadoopCatalog does not support creating table with location") - .isFalse(); - sql("CREATE TABLE tl(id BIGINT) WITH ('location'='file:///tmp/location')"); - - Table table = table("tl"); - assertThat(table.schema().asStruct()) - .isEqualTo( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); - assertThat(table.location()).isEqualTo("file:///tmp/location"); - } - - @TestTemplate - public void testCreatePartitionTable() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT, dt STRING) PARTITIONED BY(dt)"); - - Table table = table("tl"); - assertThat(table.schema().asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - assertThat(table.spec()) - 
.isEqualTo(PartitionSpec.builderFor(table.schema()).identity("dt").build()); - CatalogTable catalogTable = catalogTable("tl"); - assertThat(catalogTable.getUnresolvedSchema()) - .isEqualTo( - org.apache.flink.table.api.Schema.newBuilder() - .column("id", DataTypes.BIGINT()) - .column("dt", DataTypes.STRING()) - .build()); - assertThat(catalogTable.getPartitionKeys()).isEqualTo(Collections.singletonList("dt")); - } - - @TestTemplate - public void testCreateTableWithColumnComment() { - sql("CREATE TABLE tl(id BIGINT COMMENT 'comment - id', data STRING COMMENT 'comment - data')"); - - Table table = table("tl"); - assertThat(table.schema().asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get(), "comment - id"), - Types.NestedField.optional(2, "data", Types.StringType.get(), "comment - data")) - .asStruct()); - } - - @TestTemplate - public void testCreateTableWithFormatV2ThroughTableProperty() { - sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); - - Table table = table("tl"); - assertThat(((BaseTable) table).operations().current().formatVersion()).isEqualTo(2); - } - - @TestTemplate - public void testUpgradeTableWithFormatV2ThroughTableProperty() { - sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='1')"); - - Table table = table("tl"); - TableOperations ops = ((BaseTable) table).operations(); - assertThat(ops.refresh().formatVersion()) - .as("should create table using format v1") - .isEqualTo(1); - sql("ALTER TABLE tl SET('format-version'='2')"); - assertThat(ops.refresh().formatVersion()) - .as("should update table to use format v2") - .isEqualTo(2); - } - - @TestTemplate - public void testDowngradeTableToFormatV1ThroughTablePropertyFails() { - sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); - - Table table = table("tl"); - TableOperations ops = ((BaseTable) table).operations(); - assertThat(ops.refresh().formatVersion()) - .as("should create table using format v2") - .isEqualTo(2); - assertThatThrownBy(() -> sql("ALTER TABLE tl SET('format-version'='1')")) - .rootCause() - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot downgrade v2 table to v1"); - } - - @TestTemplate - public void testLoadTransformPartitionTable() throws TableNotExistException { - Schema schema = new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); - validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), - schema, - PartitionSpec.builderFor(schema).bucket("id", 100).build()); - - CatalogTable catalogTable = catalogTable("tl"); - assertThat(catalogTable.getUnresolvedSchema()) - .isEqualTo( - org.apache.flink.table.api.Schema.newBuilder() - .column("id", DataTypes.BIGINT()) - .build()); - assertThat(catalogTable.getPartitionKeys()).isEmpty(); - } - - @TestTemplate - public void testAlterTableProperties() { - sql("CREATE TABLE tl(id BIGINT) WITH ('oldK'='oldV')"); - Map properties = Maps.newHashMap(); - properties.put("oldK", "oldV"); - - // new - sql("ALTER TABLE tl SET('newK'='newV')"); - properties.put("newK", "newV"); - assertThat(table("tl").properties()).containsAllEntriesOf(properties); - - // update old - sql("ALTER TABLE tl SET('oldK'='oldV2')"); - properties.put("oldK", "oldV2"); - assertThat(table("tl").properties()).containsAllEntriesOf(properties); - - // remove property - sql("ALTER TABLE tl RESET('oldK')"); - properties.remove("oldK"); - assertThat(table("tl").properties()).containsAllEntriesOf(properties); - } - - @TestTemplate - public void testAlterTableAddColumn() { - 
sql("CREATE TABLE tl(id BIGINT)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); - sql("ALTER TABLE tl ADD (dt STRING)"); - Schema schemaAfter1 = table("tl").schema(); - assertThat(schemaAfter1.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - // Add multiple columns - sql("ALTER TABLE tl ADD (col1 STRING COMMENT 'comment for col1', col2 BIGINT)"); - Schema schemaAfter2 = table("tl").schema(); - assertThat(schemaAfter2.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional( - 3, "col1", Types.StringType.get(), "comment for col1"), - Types.NestedField.optional(4, "col2", Types.LongType.get())) - .asStruct()); - - // Adding an existing field should fail due to Flink's internal validation. - assertThatThrownBy(() -> sql("ALTER TABLE tl ADD (id STRING)")) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Try to add a column `id` which already exists in the table."); - } - - @TestTemplate - public void testAlterTableDropColumn() { - sql("CREATE TABLE tl(id BIGINT, dt STRING, col1 STRING, col2 BIGINT)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "col1", Types.StringType.get()), - Types.NestedField.optional(4, "col2", Types.LongType.get())) - .asStruct()); - sql("ALTER TABLE tl DROP (dt)"); - Schema schemaAfter1 = table("tl").schema(); - assertThat(schemaAfter1.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(3, "col1", Types.StringType.get()), - Types.NestedField.optional(4, "col2", Types.LongType.get())) - .asStruct()); - // Drop multiple columns - sql("ALTER TABLE tl DROP (col1, col2)"); - Schema schemaAfter2 = table("tl").schema(); - assertThat(schemaAfter2.asStruct()) - .isEqualTo( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); - // Dropping an non-existing field should fail due to Flink's internal validation. - assertThatThrownBy(() -> sql("ALTER TABLE tl DROP (foo)")) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("The column `foo` does not exist in the base table."); - - // Dropping an already-deleted field should fail due to Flink's internal validation. 
- assertThatThrownBy(() -> sql("ALTER TABLE tl DROP (dt)")) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("The column `dt` does not exist in the base table."); - } - - @TestTemplate - public void testAlterTableModifyColumnName() { - sql("CREATE TABLE tl(id BIGINT, dt STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - sql("ALTER TABLE tl RENAME dt TO data"); - Schema schemaAfter = table("tl").schema(); - assertThat(schemaAfter.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())) - .asStruct()); - } - - @TestTemplate - public void testAlterTableModifyColumnType() { - sql("CREATE TABLE tl(id INTEGER, dt STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - // Promote type from Integer to Long - sql("ALTER TABLE tl MODIFY (id BIGINT)"); - Schema schemaAfter = table("tl").schema(); - assertThat(schemaAfter.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - // Type change that doesn't follow the type-promotion rule should fail due to Iceberg's - // validation. - assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (dt INTEGER)")) - .isInstanceOf(TableException.class) - .hasMessageContaining("Could not execute AlterTable") - .hasRootCauseInstanceOf(IllegalArgumentException.class) - .hasRootCauseMessage("Cannot change column type: dt: string -> int"); - } - - @TestTemplate - public void testAlterTableModifyColumnNullability() { - sql("CREATE TABLE tl(id INTEGER NOT NULL, dt STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - - // Set nullability from required to optional - sql("ALTER TABLE tl MODIFY (id INTEGER)"); - Schema schemaAfter = table("tl").schema(); - assertThat(schemaAfter.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - } - - @TestTemplate - public void testAlterTableModifyColumnPosition() { - sql("CREATE TABLE tl(id BIGINT, dt STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - - sql("ALTER TABLE tl MODIFY (dt STRING FIRST)"); - Schema schemaAfter = table("tl").schema(); - assertThat(schemaAfter.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional(1, "id", Types.LongType.get())) - .asStruct()); - - sql("ALTER TABLE tl MODIFY (dt STRING AFTER id)"); - Schema schemaAfterAfter = table("tl").schema(); - assertThat(schemaAfterAfter.asStruct()) - .isEqualTo( - new Schema( - 
Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - // Modifying the position of a non-existing column should fail due to Flink's internal - // validation. - assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (non_existing STRING FIRST)")) - .isInstanceOf(ValidationException.class) - .hasMessageContaining( - "Try to modify a column `non_existing` which does not exist in the table."); - - // Moving a column after a non-existing column should fail due to Flink's internal validation. - assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (dt STRING AFTER non_existing)")) - .isInstanceOf(ValidationException.class) - .hasMessageContaining( - "Referenced column `non_existing` by 'AFTER' does not exist in the table."); - } - - @TestTemplate - public void testAlterTableModifyColumnComment() { - sql("CREATE TABLE tl(id BIGINT, dt STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - - sql("ALTER TABLE tl MODIFY (dt STRING COMMENT 'comment for dt field')"); - Schema schemaAfter = table("tl").schema(); - assertThat(schemaAfter.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional( - 2, "dt", Types.StringType.get(), "comment for dt field")) - .asStruct()); - } - - @TestTemplate - public void testAlterTableConstraint() { - sql("CREATE TABLE tl(id BIGINT NOT NULL, dt STRING NOT NULL, col1 STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.required(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "col1", Types.StringType.get())) - .asStruct()); - assertThat(schemaBefore.identifierFieldNames()).isEmpty(); - sql("ALTER TABLE tl ADD (PRIMARY KEY (id) NOT ENFORCED)"); - Schema schemaAfterAdd = table("tl").schema(); - assertThat(schemaAfterAdd.identifierFieldNames()).containsExactly("id"); - sql("ALTER TABLE tl MODIFY (PRIMARY KEY (dt) NOT ENFORCED)"); - Schema schemaAfterModify = table("tl").schema(); - assertThat(schemaAfterModify.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.required(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "col1", Types.StringType.get())) - .asStruct()); - assertThat(schemaAfterModify.identifierFieldNames()).containsExactly("dt"); - // Composite primary key - sql("ALTER TABLE tl MODIFY (PRIMARY KEY (id, dt) NOT ENFORCED)"); - Schema schemaAfterComposite = table("tl").schema(); - assertThat(schemaAfterComposite.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.required(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "col1", Types.StringType.get())) - .asStruct()); - assertThat(schemaAfterComposite.identifierFieldNames()).containsExactlyInAnyOrder("id", "dt"); - // Setting an optional field as primary key should fail - // because Iceberg's SchemaUpdate does not allow incompatible changes. 
- assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (PRIMARY KEY (col1) NOT ENFORCED)")) - .isInstanceOf(TableException.class) - .hasMessageContaining("Could not execute AlterTable") - .hasRootCauseInstanceOf(IllegalArgumentException.class) - .hasRootCauseMessage("Cannot add field col1 as an identifier field: not a required field"); - - // Setting a composite key containing an optional field should fail - // because Iceberg's SchemaUpdate does not allow incompatible changes. - assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (PRIMARY KEY (id, col1) NOT ENFORCED)")) - .isInstanceOf(TableException.class) - .hasMessageContaining("Could not execute AlterTable") - .hasRootCauseInstanceOf(IllegalArgumentException.class) - .hasRootCauseMessage("Cannot add field col1 as an identifier field: not a required field"); - - // Dropping constraints is not supported yet - assertThatThrownBy(() -> sql("ALTER TABLE tl DROP PRIMARY KEY")) - .isInstanceOf(TableException.class) - .hasMessageContaining("Could not execute AlterTable") - .hasRootCauseInstanceOf(UnsupportedOperationException.class) - .hasRootCauseMessage("Unsupported table change: DropConstraint."); - } - - @TestTemplate - public void testRelocateTable() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support relocate table").isFalse(); - sql("CREATE TABLE tl(id BIGINT)"); - sql("ALTER TABLE tl SET('location'='file:///tmp/location')"); - assertThat(table("tl").location()).isEqualTo("file:///tmp/location"); - } - - @TestTemplate - public void testSetCurrentAndCherryPickSnapshotId() { - sql("CREATE TABLE tl(c1 INT, c2 STRING, c3 STRING) PARTITIONED BY (c1)"); - - Table table = table("tl"); - - DataFile fileA = - DataFiles.builder(table.spec()) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile fileB = - DataFiles.builder(table.spec()) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile replacementFile = - DataFiles.builder(table.spec()) - .withPath("/path/to/data-a-replacement.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - table.newAppend().appendFile(fileA).commit(); - long snapshotId = table.currentSnapshot().snapshotId(); - - // stage an overwrite that replaces FILE_A - table.newReplacePartitions().addFile(replacementFile).stageOnly().commit(); - - Snapshot staged = Iterables.getLast(table.snapshots()); - assertThat(staged.operation()) - .as("Should find the staged overwrite snapshot") - .isEqualTo(DataOperations.OVERWRITE); - // add another append so that the original commit can't be fast-forwarded - table.newAppend().appendFile(fileB).commit(); - - // test cherry pick - sql("ALTER TABLE tl SET('cherry-pick-snapshot-id'='%s')", staged.snapshotId()); - validateTableFiles(table, fileB, replacementFile); - - // test set current snapshot - sql("ALTER TABLE tl SET('current-snapshot-id'='%s')", snapshotId); - validateTableFiles(table, fileA); - } - - private void validateTableFiles(Table tbl, DataFile... 
expectedFiles) {
-    tbl.refresh();
-    Set<String> expectedFilePaths =
-        Arrays.stream(expectedFiles).map(DataFile::location).collect(Collectors.toSet());
-    Set<String> actualFilePaths =
-        StreamSupport.stream(tbl.newScan().planFiles().spliterator(), false)
-            .map(FileScanTask::file)
-            .map(ContentFile::location)
-            .collect(Collectors.toSet());
-    assertThat(actualFilePaths).as("Files should match").isEqualTo(expectedFilePaths);
-  }
-
-  private Table table(String name) {
-    return validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, name));
-  }
-
-  private CatalogTable catalogTable(String name) throws TableNotExistException {
-    return catalogTable(getTableEnv().getCurrentCatalog(), DATABASE, name);
-  }
-
-  private CatalogTable catalogTable(String catalog, String database, String table)
-      throws TableNotExistException {
-    return (CatalogTable)
-        getTableEnv().getCatalog(catalog).get().getTable(new ObjectPath(database, table));
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java
deleted file mode 100644
index e69e1ac4d713..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink;
-
-import static org.assertj.core.api.Assertions.assertThat;
-import static org.assertj.core.api.Assertions.assertThatThrownBy;
-
-import java.util.List;
-import org.apache.flink.table.catalog.CatalogPartitionSpec;
-import org.apache.flink.table.catalog.ObjectPath;
-import org.apache.flink.table.catalog.exceptions.TableNotExistException;
-import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException;
-import org.apache.iceberg.CatalogProperties;
-import org.apache.iceberg.FileFormat;
-import org.apache.iceberg.Parameter;
-import org.apache.iceberg.Parameters;
-import org.apache.iceberg.catalog.Namespace;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.TestTemplate;
-
-public class TestFlinkCatalogTablePartitions extends CatalogTestBase {
-
-  private final String tableName = "test_table";
-
-  @Parameter(index = 2)
-  private FileFormat format;
-
-  @Parameter(index = 3)
-  private Boolean cacheEnabled;
-
-  @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}")
-  protected static List<Object[]> parameters() {
-    List<Object[]> parameters = Lists.newArrayList();
-    for (FileFormat format :
-        new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) {
-      for (Boolean cacheEnabled : new Boolean[] {true, false}) {
-        for (Object[] catalogParams : CatalogTestBase.parameters()) {
-          String catalogName = (String) catalogParams[0];
-          Namespace baseNamespace = (Namespace) catalogParams[1];
-          parameters.add(new Object[] {catalogName, baseNamespace, format, cacheEnabled});
-        }
-      }
-    }
-    return parameters;
-  }
-
-  @Override
-  @BeforeEach
-  public void before() {
-    super.before();
-    config.put(CatalogProperties.CACHE_ENABLED, String.valueOf(cacheEnabled));
-    sql("CREATE DATABASE %s", flinkDatabase);
-    sql("USE CATALOG %s", catalogName);
-    sql("USE %s", DATABASE);
-  }
-
-  @AfterEach
-  public void cleanNamespaces() {
-    sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName);
-    dropDatabase(flinkDatabase, true);
-    super.clean();
-  }
-
-  @TestTemplate
-  public void testListPartitionsWithUnpartitionedTable() {
-    sql(
-        "CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')",
-        tableName, format.name());
-    sql("INSERT INTO %s SELECT 1,'a'", tableName);
-
-    ObjectPath objectPath = new ObjectPath(DATABASE, tableName);
-    FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get();
-    assertThatThrownBy(() -> flinkCatalog.listPartitions(objectPath))
-        .isInstanceOf(TableNotPartitionedException.class)
-        .hasMessageStartingWith("Table db.test_table in catalog")
-        .hasMessageEndingWith("is not partitioned.");
-  }
-
-  @TestTemplate
-  public void testListPartitionsWithPartitionedTable()
-      throws TableNotExistException, TableNotPartitionedException {
-    sql(
-        "CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) "
-            + "with ('write.format.default'='%s')",
-        tableName, format.name());
-    sql("INSERT INTO %s SELECT 1,'a'", tableName);
-    sql("INSERT INTO %s SELECT 2,'b'", tableName);
-
-    ObjectPath objectPath = new ObjectPath(DATABASE, tableName);
-    FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get();
-    List<CatalogPartitionSpec> list = flinkCatalog.listPartitions(objectPath);
-    assertThat(list).hasSize(2);
-    List<CatalogPartitionSpec> expected = Lists.newArrayList();
-    CatalogPartitionSpec partitionSpec1 = new CatalogPartitionSpec(ImmutableMap.of("data", "a"));
-    CatalogPartitionSpec partitionSpec2 = new CatalogPartitionSpec(ImmutableMap.of("data", "b"));
-    expected.add(partitionSpec1);
-    expected.add(partitionSpec2);
-    assertThat(list).as("Should produce the expected catalog partition specs.").isEqualTo(expected);
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java
deleted file mode 100644
index 4b6ac25ab8e3..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink;
-
-import static org.assertj.core.api.Assertions.assertThat;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.when;
-
-import java.time.Duration;
-import java.util.Map;
-import org.apache.flink.configuration.ConfigOption;
-import org.apache.flink.configuration.ConfigOptions;
-import org.apache.flink.configuration.Configuration;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-import org.junit.jupiter.api.Test;
-
-public class TestFlinkConfParser {
-
-  @Test
-  public void testDurationConf() {
-    Map<String, String> writeOptions = ImmutableMap.of("write-prop", "111s");
-
-    ConfigOption<Duration> configOption =
-        ConfigOptions.key("conf-prop").durationType().noDefaultValue();
-    Configuration flinkConf = new Configuration();
-    flinkConf.setString(configOption.key(), "222s");
-
-    Table table = mock(Table.class);
-    when(table.properties()).thenReturn(ImmutableMap.of("table-prop", "333s"));
-
-    FlinkConfParser confParser = new FlinkConfParser(table, writeOptions, flinkConf);
-    Duration defaultVal = Duration.ofMillis(999);
-
-    Duration result =
-        confParser.durationConf().option("write-prop").defaultValue(defaultVal).parse();
-    assertThat(result).isEqualTo(Duration.ofSeconds(111));
-
-    result = confParser.durationConf().flinkConfig(configOption).defaultValue(defaultVal).parse();
-    assertThat(result).isEqualTo(Duration.ofSeconds(222));
-
-    result = confParser.durationConf().tableProperty("table-prop").defaultValue(defaultVal).parse();
-    assertThat(result).isEqualTo(Duration.ofSeconds(333));
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java
deleted file mode 100644
index f19f0b447878..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java
+++ /dev/null
@@ -1,463 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more
contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Expressions; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.expressions.ApiExpressionUtils; -import org.apache.flink.table.expressions.CallExpression; -import org.apache.flink.table.expressions.Expression; -import org.apache.flink.table.expressions.FieldReferenceExpression; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.expressions.UnresolvedCallExpression; -import org.apache.flink.table.expressions.UnresolvedReferenceExpression; -import org.apache.flink.table.expressions.ValueLiteralExpression; -import org.apache.flink.table.expressions.utils.ApiExpressionDefaultVisitor; -import org.apache.flink.table.functions.BuiltInFunctionDefinitions; -import org.apache.iceberg.expressions.And; -import org.apache.iceberg.expressions.BoundLiteralPredicate; -import org.apache.iceberg.expressions.Not; -import org.apache.iceberg.expressions.Or; -import org.apache.iceberg.expressions.UnboundPredicate; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.Pair; -import org.junit.jupiter.api.Test; - -public class TestFlinkFilters { - - private static final ResolvedSchema RESOLVED_SCHEMA = - ResolvedSchema.of( - Column.physical("field1", DataTypes.INT()), - Column.physical("field2", DataTypes.BIGINT()), - Column.physical("field3", DataTypes.FLOAT()), - Column.physical("field4", DataTypes.DOUBLE()), - Column.physical("field5", DataTypes.STRING()), - Column.physical("field6", DataTypes.BOOLEAN()), - Column.physical("field7", DataTypes.BINARY(2)), - Column.physical("field8", DataTypes.DECIMAL(10, 2)), - Column.physical("field9", DataTypes.DATE()), - Column.physical("field10", DataTypes.TIME()), - Column.physical("field11", DataTypes.TIMESTAMP()), - Column.physical("field12", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE())); - - // A map list of fields and values used to verify the conversion of flink expression to iceberg - // expression - private static final List> FIELD_VALUE_LIST = - ImmutableList.of( - Pair.of("field1", 1), - Pair.of("field2", 2L), - Pair.of("field3", 3F), - Pair.of("field4", 4D), - Pair.of("field5", "iceberg"), - Pair.of("field6", true), - Pair.of("field7", new byte[] {'a', 'b'}), - Pair.of("field8", 
BigDecimal.valueOf(10.12)), - Pair.of("field9", DateTimeUtil.daysFromDate(LocalDate.now())), - Pair.of("field10", DateTimeUtil.microsFromTime(LocalTime.now())), - Pair.of("field11", DateTimeUtil.microsFromTimestamp(LocalDateTime.now())), - Pair.of("field12", DateTimeUtil.microsFromInstant(Instant.now()))); - - @Test - public void testFlinkDataTypeEqual() { - matchLiteral("field1", 1, 1); - matchLiteral("field2", 10L, 10L); - matchLiteral("field3", 1.2F, 1.2F); - matchLiteral("field4", 3.4D, 3.4D); - matchLiteral("field5", "abcd", "abcd"); - matchLiteral("field6", true, true); - matchLiteral("field7", new byte[] {'a', 'b'}, ByteBuffer.wrap(new byte[] {'a', 'b'})); - matchLiteral("field8", BigDecimal.valueOf(10.12), BigDecimal.valueOf(10.12)); - - LocalDate date = LocalDate.parse("2020-12-23"); - matchLiteral("field9", date, DateTimeUtil.daysFromDate(date)); - - LocalTime time = LocalTime.parse("12:13:14"); - matchLiteral("field10", time, DateTimeUtil.microsFromTime(time)); - - LocalDateTime dateTime = LocalDateTime.parse("2020-12-23T12:13:14"); - matchLiteral("field11", dateTime, DateTimeUtil.microsFromTimestamp(dateTime)); - - Instant instant = Instant.parse("2020-12-23T12:13:14.00Z"); - matchLiteral("field12", instant, DateTimeUtil.microsFromInstant(instant)); - } - - @Test - public void testEquals() { - for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.equal(pair.first(), pair.second()); - - Optional actual = - FlinkFilters.convert( - resolve(Expressions.$(pair.first()).isEqual(Expressions.lit(pair.second())))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert( - resolve(Expressions.lit(pair.second()).isEqual(Expressions.$(pair.first())))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - } - - @Test - public void testEqualsNaN() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.isNaN("field3"); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field3").isEqual(Expressions.lit(Float.NaN)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(Float.NaN).isEqual(Expressions.$("field3")))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testNotEquals() { - for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.notEqual(pair.first(), pair.second()); - - Optional actual = - FlinkFilters.convert( - resolve(Expressions.$(pair.first()).isNotEqual(Expressions.lit(pair.second())))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert( - resolve(Expressions.lit(pair.second()).isNotEqual(Expressions.$(pair.first())))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - } - - @Test - public void testNotEqualsNaN() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notNaN("field3"); - - Optional actual = - FlinkFilters.convert( - resolve(Expressions.$("field3").isNotEqual(Expressions.lit(Float.NaN)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert( - resolve(Expressions.lit(Float.NaN).isNotEqual(Expressions.$("field3")))); - 
assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testGreaterThan() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.greaterThan("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isGreater(Expressions.lit(1)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isLess(Expressions.$("field1")))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testGreaterThanEquals() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.greaterThanOrEqual("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isGreaterOrEqual(Expressions.lit(1)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isLessOrEqual(Expressions.$("field1")))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testLessThan() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.lessThan("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isLess(Expressions.lit(1)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isGreater(Expressions.$("field1")))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testLessThanEquals() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.lessThanOrEqual("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isLessOrEqual(Expressions.lit(1)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isGreaterOrEqual(Expressions.$("field1")))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testIsNull() { - Expression expr = resolve(Expressions.$("field1").isNull()); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.isNull("field1"); - assertPredicatesMatch(expected, actual.get()); - } - - @Test - public void testIsNotNull() { - Expression expr = resolve(Expressions.$("field1").isNotNull()); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.notNull("field1"); - assertPredicatesMatch(expected, actual.get()); - } - - @Test - public void testAnd() { - Expression expr = - resolve( - Expressions.$("field1") - .isEqual(Expressions.lit(1)) - .and(Expressions.$("field2").isEqual(Expressions.lit(2L)))); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - And and = (And) actual.get(); - And expected = - (And) - org.apache.iceberg.expressions.Expressions.and( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); - - assertPredicatesMatch(expected.left(), and.left()); - assertPredicatesMatch(expected.right(), and.right()); 
- } - - @Test - public void testOr() { - Expression expr = - resolve( - Expressions.$("field1") - .isEqual(Expressions.lit(1)) - .or(Expressions.$("field2").isEqual(Expressions.lit(2L)))); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - Or or = (Or) actual.get(); - Or expected = - (Or) - org.apache.iceberg.expressions.Expressions.or( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); - - assertPredicatesMatch(expected.left(), or.left()); - assertPredicatesMatch(expected.right(), or.right()); - } - - @Test - public void testNot() { - Expression expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.NOT, - Expressions.$("field1").isEqual(Expressions.lit(1)))); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - Not not = (Not) actual.get(); - Not expected = - (Not) - org.apache.iceberg.expressions.Expressions.not( - org.apache.iceberg.expressions.Expressions.equal("field1", 1)); - - assertThat(not.op()).as("Predicate operation should match").isEqualTo(expected.op()); - assertPredicatesMatch(expected.child(), not.child()); - } - - @Test - public void testLike() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.startsWith("field5", "abc"); - Expression expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%"))); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, - Expressions.$("field5"), - Expressions.lit("%abc%"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, - Expressions.$("field5"), - Expressions.lit("abc%d"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a_"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a%b"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - } - - @SuppressWarnings("unchecked") - private void matchLiteral(String fieldName, Object flinkLiteral, T icebergLiteral) { - Expression expr = resolve(Expressions.$(fieldName).isEqual(Expressions.lit(flinkLiteral))); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - org.apache.iceberg.expressions.Expression expression = actual.get(); - assertThat(expression) - .as("The expression should be a UnboundPredicate") - .isInstanceOf(UnboundPredicate.class); - UnboundPredicate unboundPredicate = (UnboundPredicate) expression; - - 
org.apache.iceberg.expressions.Expression expression1 = - unboundPredicate.bind(FlinkSchemaUtil.convert(RESOLVED_SCHEMA).asStruct(), false); - assertThat(expression1) - .as("The expression should be a BoundLiteralPredicate") - .isInstanceOf(BoundLiteralPredicate.class); - - BoundLiteralPredicate predicate = (BoundLiteralPredicate) expression1; - assertThat(predicate.test(icebergLiteral)).isTrue(); - } - - private static Expression resolve(Expression originalExpression) { - return originalExpression.accept( - new ApiExpressionDefaultVisitor<>() { - @Override - public Expression visit(UnresolvedReferenceExpression unresolvedReference) { - String name = unresolvedReference.getName(); - return RESOLVED_SCHEMA - .getColumn(name) - .map( - column -> { - int columnIndex = RESOLVED_SCHEMA.getColumns().indexOf(column); - return new FieldReferenceExpression( - name, column.getDataType(), 0, columnIndex); - }) - .orElse(null); - } - - @Override - public Expression visit(UnresolvedCallExpression unresolvedCall) { - List children = - unresolvedCall.getChildren().stream() - .map(e -> (ResolvedExpression) e.accept(this)) - .collect(Collectors.toList()); - return new CallExpression( - unresolvedCall.getFunctionDefinition(), children, DataTypes.STRING()); - } - - @Override - public Expression visit(ValueLiteralExpression valueLiteral) { - return valueLiteral; - } - - @Override - protected Expression defaultMethod(Expression expression) { - throw new UnsupportedOperationException( - String.format("unsupported expression: %s", expression)); - } - }); - } - - private void assertPredicatesMatch( - org.apache.iceberg.expressions.Expression expected, - org.apache.iceberg.expressions.Expression actual) { - assertThat(expected) - .as("The expected expression should be a UnboundPredicate") - .isInstanceOf(UnboundPredicate.class); - assertThat(actual) - .as("The actual expression should be a UnboundPredicate") - .isInstanceOf(UnboundPredicate.class); - UnboundPredicate predicateExpected = (UnboundPredicate) expected; - UnboundPredicate predicateActual = (UnboundPredicate) actual; - assertThat(predicateActual.op()).isEqualTo(predicateExpected.op()); - assertThat(predicateActual.literal()).isEqualTo(predicateExpected.literal()); - assertThat(predicateActual.ref().name()).isEqualTo(predicateExpected.ref().name()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java deleted file mode 100644 index 91343ab1ee72..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */
-package org.apache.iceberg.flink;
-
-import static org.assertj.core.api.Assertions.assertThat;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Map;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.iceberg.CatalogProperties;
-import org.apache.iceberg.relocated.com.google.common.collect.Maps;
-import org.junit.jupiter.api.Test;
-
-public class TestFlinkHiveCatalog extends TestBase {
-
-  @Test
-  public void testCreateCatalogWithWarehouseLocation() throws IOException {
-    Map<String, String> props = Maps.newHashMap();
-    props.put("type", "iceberg");
-    props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive");
-    props.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf));
-
-    File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile();
-    props.put(CatalogProperties.WAREHOUSE_LOCATION, "file://" + warehouseDir.getAbsolutePath());
-
-    checkSQLQuery(props, warehouseDir);
-  }
-
-  @Test
-  public void testCreateCatalogWithHiveConfDir() throws IOException {
-    // Dump the hive conf into a local file.
-    File hiveConfDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile();
-    File hiveSiteXML = new File(hiveConfDir, "hive-site.xml");
-    File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile();
-    try (FileOutputStream fos = new FileOutputStream(hiveSiteXML)) {
-      Configuration newConf = new Configuration(hiveConf);
-      // Set another new directory which is different with the hive metastore's warehouse path.
-      newConf.set(
-          HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file://" + warehouseDir.getAbsolutePath());
-      newConf.writeXml(fos);
-    }
-    assertThat(hiveSiteXML.toPath()).exists();
-
-    // Construct the catalog attributions.
-    Map<String, String> props = Maps.newHashMap();
-    props.put("type", "iceberg");
-    props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive");
-    props.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf));
-    // Set the 'hive-conf-dir' instead of 'warehouse'
-    props.put(FlinkCatalogFactory.HIVE_CONF_DIR, hiveConfDir.getAbsolutePath());
-
-    checkSQLQuery(props, warehouseDir);
-  }
-
-  private void checkSQLQuery(Map<String, String> catalogProperties, File warehouseDir)
-      throws IOException {
-    sql("CREATE CATALOG test_catalog WITH %s", CatalogTestBase.toWithClause(catalogProperties));
-    sql("USE CATALOG test_catalog");
-    sql("CREATE DATABASE test_db");
-    sql("USE test_db");
-    sql("CREATE TABLE test_table(c1 INT, c2 STRING)");
-    sql("INSERT INTO test_table SELECT 1, 'a'");
-
-    Path databasePath = warehouseDir.toPath().resolve("test_db.db");
-    assertThat(databasePath).exists();
-
-    Path tablePath = databasePath.resolve("test_table");
-    assertThat(tablePath).exists();
-
-    Path dataPath = tablePath.resolve("data");
-    assertThat(dataPath).exists();
-    assertThat(Files.list(dataPath).count())
-        .as("Should have a .crc file and a .parquet file")
-        .isEqualTo(2);
-
-    sql("DROP TABLE test_table");
-    dropDatabase("test_db", false);
-    dropCatalog("test_catalog", false);
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java
deleted file mode 100644
index 6236dc6e2df2..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java
+++ /dev/null
@@ -1,474 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.Collections; -import java.util.List; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.catalog.UniqueConstraint; -import org.apache.flink.table.types.logical.BinaryType; -import org.apache.flink.table.types.logical.CharType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.TimeType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.flink.table.types.logical.VarBinaryType; -import org.apache.flink.table.types.logical.VarCharType; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestFlinkSchemaUtil { - - @Parameter private boolean isTableSchema; - - @Parameters(name = "isTableSchema={0}") - private static Object[][] parameters() { - return new Object[][] {{true}, {false}}; - } - - @TestTemplate - public void testConvertFlinkSchemaToIcebergSchema() { - ResolvedSchema flinkSchema = - ResolvedSchema.of( - Column.physical("id", DataTypes.INT().notNull()), - Column.physical("name", DataTypes.STRING()) /* optional by default */, - Column.physical("salary", DataTypes.DOUBLE().notNull()), - Column.physical( - "locations", - DataTypes.MAP( - DataTypes.STRING(), - DataTypes.ROW( - DataTypes.FIELD("posX", DataTypes.DOUBLE().notNull(), "X field"), - DataTypes.FIELD("posY", DataTypes.DOUBLE().notNull(), "Y field")))), - Column.physical("strArray", DataTypes.ARRAY(DataTypes.STRING()).nullable()), - Column.physical("intArray", DataTypes.ARRAY(DataTypes.INT()).nullable()), - Column.physical("char", DataTypes.CHAR(10).notNull()), - Column.physical("varchar", DataTypes.VARCHAR(10).notNull()), - Column.physical("boolean", DataTypes.BOOLEAN().nullable()), - Column.physical("tinyint", DataTypes.TINYINT()), - Column.physical("smallint", DataTypes.SMALLINT()), - Column.physical("bigint", DataTypes.BIGINT()), - Column.physical("varbinary", DataTypes.VARBINARY(10)), - Column.physical("binary", DataTypes.BINARY(10)), - Column.physical("time", DataTypes.TIME()), - Column.physical("timestampWithoutZone", DataTypes.TIMESTAMP()), - Column.physical("timestampWithZone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()), - Column.physical("date", DataTypes.DATE()), - Column.physical("decimal", DataTypes.DECIMAL(2, 2)), - Column.physical("decimal2", DataTypes.DECIMAL(38, 2)), - Column.physical("decimal3", DataTypes.DECIMAL(10, 1)), - Column.physical("multiset", DataTypes.MULTISET(DataTypes.STRING().notNull()))); - - Schema icebergSchema = - new Schema( - Types.NestedField.required(0, "id", 
Types.IntegerType.get(), null), - Types.NestedField.optional(1, "name", Types.StringType.get(), null), - Types.NestedField.required(2, "salary", Types.DoubleType.get(), null), - Types.NestedField.optional( - 3, - "locations", - Types.MapType.ofOptional( - 24, - 25, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(22, "posX", Types.DoubleType.get(), "X field"), - Types.NestedField.required( - 23, "posY", Types.DoubleType.get(), "Y field")))), - Types.NestedField.optional( - 4, "strArray", Types.ListType.ofOptional(26, Types.StringType.get())), - Types.NestedField.optional( - 5, "intArray", Types.ListType.ofOptional(27, Types.IntegerType.get())), - Types.NestedField.required(6, "char", Types.StringType.get()), - Types.NestedField.required(7, "varchar", Types.StringType.get()), - Types.NestedField.optional(8, "boolean", Types.BooleanType.get()), - Types.NestedField.optional(9, "tinyint", Types.IntegerType.get()), - Types.NestedField.optional(10, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(11, "bigint", Types.LongType.get()), - Types.NestedField.optional(12, "varbinary", Types.BinaryType.get()), - Types.NestedField.optional(13, "binary", Types.FixedType.ofLength(10)), - Types.NestedField.optional(14, "time", Types.TimeType.get()), - Types.NestedField.optional( - 15, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.optional(16, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.optional(17, "date", Types.DateType.get()), - Types.NestedField.optional(18, "decimal", Types.DecimalType.of(2, 2)), - Types.NestedField.optional(19, "decimal2", Types.DecimalType.of(38, 2)), - Types.NestedField.optional(20, "decimal3", Types.DecimalType.of(10, 1)), - Types.NestedField.optional( - 21, - "multiset", - Types.MapType.ofRequired(28, 29, Types.StringType.get(), Types.IntegerType.get()))); - - checkSchema(flinkSchema, icebergSchema); - } - - @TestTemplate - public void testMapField() { - ResolvedSchema flinkSchema = - ResolvedSchema.of( - Column.physical( - "map_int_long", - DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT()).notNull()) /* Required */, - Column.physical( - "map_int_array_string", - DataTypes.MAP(DataTypes.ARRAY(DataTypes.INT()), DataTypes.STRING())), - Column.physical( - "map_decimal_string", DataTypes.MAP(DataTypes.DECIMAL(10, 2), DataTypes.STRING())), - Column.physical( - "map_fields_fields", - DataTypes.MAP( - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT(), "doc - int"), - DataTypes.FIELD("field_string", DataTypes.STRING(), "doc - string")) - .notNull(), /* Required */ - DataTypes.ROW( - DataTypes.FIELD( - "field_array", - DataTypes.ARRAY(DataTypes.STRING()), - "doc - array")) - .notNull() /* Required */) - .notNull() /* Required */)); - - Schema icebergSchema = - new Schema( - Types.NestedField.required( - 0, - "map_int_long", - Types.MapType.ofOptional(4, 5, Types.IntegerType.get(), Types.LongType.get()), - null), - Types.NestedField.optional( - 1, - "map_int_array_string", - Types.MapType.ofOptional( - 7, - 8, - Types.ListType.ofOptional(6, Types.IntegerType.get()), - Types.StringType.get()), - null), - Types.NestedField.optional( - 2, - "map_decimal_string", - Types.MapType.ofOptional( - 9, 10, Types.DecimalType.of(10, 2), Types.StringType.get())), - Types.NestedField.required( - 3, - "map_fields_fields", - Types.MapType.ofRequired( - 15, - 16, - Types.StructType.of( - Types.NestedField.optional( - 11, "field_int", Types.IntegerType.get(), "doc - int"), - 
Types.NestedField.optional( - 12, "field_string", Types.StringType.get(), "doc - string")), - Types.StructType.of( - Types.NestedField.optional( - 14, - "field_array", - Types.ListType.ofOptional(13, Types.StringType.get()), - "doc - array"))))); - - checkSchema(flinkSchema, icebergSchema); - } - - @TestTemplate - public void testStructField() { - ResolvedSchema flinkSchema = - ResolvedSchema.of( - Column.physical( - "struct_int_string_decimal", - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT()), - DataTypes.FIELD("field_string", DataTypes.STRING()), - DataTypes.FIELD("field_decimal", DataTypes.DECIMAL(19, 2)), - DataTypes.FIELD( - "field_struct", - DataTypes.ROW( - DataTypes.FIELD("inner_struct_int", DataTypes.INT()), - DataTypes.FIELD( - "inner_struct_float_array", - DataTypes.ARRAY(DataTypes.FLOAT()))) - .notNull()) /* Row is required */) - .notNull()) /* Required */, - Column.physical( - "struct_map_int_int", - DataTypes.ROW( - DataTypes.FIELD( - "field_map", DataTypes.MAP(DataTypes.INT(), DataTypes.INT()))) - .nullable()) /* Optional */); - - Schema icebergSchema = - new Schema( - Types.NestedField.required( - 0, - "struct_int_string_decimal", - Types.StructType.of( - Types.NestedField.optional(5, "field_int", Types.IntegerType.get()), - Types.NestedField.optional(6, "field_string", Types.StringType.get()), - Types.NestedField.optional(7, "field_decimal", Types.DecimalType.of(19, 2)), - Types.NestedField.required( - 8, - "field_struct", - Types.StructType.of( - Types.NestedField.optional( - 3, "inner_struct_int", Types.IntegerType.get()), - Types.NestedField.optional( - 4, - "inner_struct_float_array", - Types.ListType.ofOptional(2, Types.FloatType.get())))))), - Types.NestedField.optional( - 1, - "struct_map_int_int", - Types.StructType.of( - Types.NestedField.optional( - 11, - "field_map", - Types.MapType.ofOptional( - 9, 10, Types.IntegerType.get(), Types.IntegerType.get()))))); - - checkSchema(flinkSchema, icebergSchema); - } - - @TestTemplate - public void testListField() { - ResolvedSchema flinkSchema = - ResolvedSchema.of( - Column.physical( - "list_struct_fields", - DataTypes.ARRAY(DataTypes.ROW(DataTypes.FIELD("field_int", DataTypes.INT()))) - .notNull()) /* Required */, - Column.physical( - "list_optional_struct_fields", - DataTypes.ARRAY( - DataTypes.ROW( - DataTypes.FIELD( - "field_timestamp_with_local_time_zone", - DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()))) - .nullable()) /* Optional */, - Column.physical( - "list_map_fields", - DataTypes.ARRAY( - DataTypes.MAP( - DataTypes.ARRAY( - DataTypes.INT().notNull()), /* Key of map must be required */ - DataTypes.ROW( - DataTypes.FIELD("field_0", DataTypes.INT(), "doc - int"))) - .notNull()) - .notNull()) /* Required */); - - Schema icebergSchema = - new Schema( - Types.NestedField.required( - 0, - "list_struct_fields", - Types.ListType.ofOptional( - 4, - Types.StructType.of( - Types.NestedField.optional(3, "field_int", Types.IntegerType.get())))), - Types.NestedField.optional( - 1, - "list_optional_struct_fields", - Types.ListType.ofOptional( - 6, - Types.StructType.of( - Types.NestedField.optional( - 5, - "field_timestamp_with_local_time_zone", - Types.TimestampType.withZone())))), - Types.NestedField.required( - 2, - "list_map_fields", - Types.ListType.ofRequired( - 11, - Types.MapType.ofOptional( - 9, - 10, - Types.ListType.ofRequired(7, Types.IntegerType.get()), - Types.StructType.of( - Types.NestedField.optional( - 8, "field_0", Types.IntegerType.get(), "doc - int")))))); - - checkSchema(flinkSchema, 
icebergSchema); - } - - private void checkSchema(ResolvedSchema flinkSchema, Schema icebergSchema) { - if (isTableSchema) { - assertThat(FlinkSchemaUtil.convert(TableSchema.fromResolvedSchema(flinkSchema)).asStruct()) - .isEqualTo(icebergSchema.asStruct()); - // The conversion is not a 1:1 mapping, so we just check iceberg types. - assertThat( - FlinkSchemaUtil.convert( - FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergSchema))) - .asStruct()) - .isEqualTo(icebergSchema.asStruct()); - } else { - assertThat(FlinkSchemaUtil.convert(flinkSchema).asStruct()) - .isEqualTo(icebergSchema.asStruct()); - // The conversion is not a 1:1 mapping, so we just check iceberg types. - assertThat( - FlinkSchemaUtil.convert( - FlinkSchemaUtil.toResolvedSchema(FlinkSchemaUtil.convert(icebergSchema))) - .asStruct()) - .isEqualTo(icebergSchema.asStruct()); - } - } - - @Test - public void testInconsistentTypes() { - checkInconsistentType( - Types.UUIDType.get(), new BinaryType(16), new BinaryType(16), Types.FixedType.ofLength(16)); - checkInconsistentType( - Types.StringType.get(), - new VarCharType(VarCharType.MAX_LENGTH), - new CharType(100), - Types.StringType.get()); - checkInconsistentType( - Types.BinaryType.get(), - new VarBinaryType(VarBinaryType.MAX_LENGTH), - new VarBinaryType(100), - Types.BinaryType.get()); - checkInconsistentType( - Types.TimeType.get(), new TimeType(), new TimeType(3), Types.TimeType.get()); - checkInconsistentType( - Types.TimestampType.withoutZone(), - new TimestampType(6), - new TimestampType(3), - Types.TimestampType.withoutZone()); - checkInconsistentType( - Types.TimestampType.withZone(), - new LocalZonedTimestampType(6), - new LocalZonedTimestampType(3), - Types.TimestampType.withZone()); - } - - private void checkInconsistentType( - Type icebergType, - LogicalType flinkExpectedType, - LogicalType flinkType, - Type icebergExpectedType) { - assertThat(FlinkSchemaUtil.convert(icebergType)).isEqualTo(flinkExpectedType); - assertThat(FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(RowType.of(flinkType))).asStruct()) - .isEqualTo(Types.StructType.of(Types.NestedField.optional(0, "f0", icebergExpectedType))); - assertThat( - FlinkSchemaUtil.convert(FlinkSchemaUtil.toResolvedSchema(RowType.of(flinkType))) - .asStruct()) - .isEqualTo(Types.StructType.of(Types.NestedField.optional(0, "f0", icebergExpectedType))); - } - - @TestTemplate - public void testConvertFlinkSchemaBaseOnIcebergSchema() { - Schema baseSchema = - new Schema( - Lists.newArrayList( - Types.NestedField.required(101, "int", Types.IntegerType.get()), - Types.NestedField.optional(102, "string", Types.StringType.get())), - Sets.newHashSet(101)); - - Schema convertedSchema; - if (isTableSchema) { - TableSchema flinkSchema = - TableSchema.builder() - .field("int", DataTypes.INT().notNull()) - .field("string", DataTypes.STRING().nullable()) - .primaryKey("int") - .build(); - convertedSchema = FlinkSchemaUtil.convert(baseSchema, flinkSchema); - } else { - ResolvedSchema flinkSchema = - new ResolvedSchema( - List.of( - Column.physical("int", DataTypes.INT().notNull()), - Column.physical("string", DataTypes.STRING().nullable())), - Collections.emptyList(), - UniqueConstraint.primaryKey("pk", List.of("int"))); - convertedSchema = FlinkSchemaUtil.convert(baseSchema, flinkSchema); - } - - assertThat(convertedSchema.asStruct()).isEqualTo(baseSchema.asStruct()); - assertThat(convertedSchema.identifierFieldIds()).containsExactly(101); - } - - @TestTemplate - public void testConvertFlinkSchemaWithPrimaryKeys() { - Schema 
icebergSchema = - new Schema( - Lists.newArrayList( - Types.NestedField.required(1, "int", Types.IntegerType.get()), - Types.NestedField.required(2, "string", Types.StringType.get())), - Sets.newHashSet(1, 2)); - - if (isTableSchema) { - TableSchema tableSchema = FlinkSchemaUtil.toSchema(icebergSchema); - assertThat(tableSchema.getPrimaryKey()) - .isPresent() - .get() - .satisfies(k -> assertThat(k.getColumns()).containsExactly("int", "string")); - } else { - ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(icebergSchema); - assertThat(resolvedSchema.getPrimaryKey()) - .isPresent() - .get() - .satisfies(k -> assertThat(k.getColumns()).containsExactly("int", "string")); - } - } - - @TestTemplate - public void testConvertFlinkSchemaWithNestedColumnInPrimaryKeys() { - Schema icebergSchema = - new Schema( - Lists.newArrayList( - Types.NestedField.required( - 1, - "struct", - Types.StructType.of( - Types.NestedField.required(2, "inner", Types.IntegerType.get())))), - Sets.newHashSet(2)); - - if (isTableSchema) { - assertThatThrownBy(() -> FlinkSchemaUtil.toSchema(icebergSchema)) - .isInstanceOf(ValidationException.class) - .hasMessageStartingWith("Could not create a PRIMARY KEY") - .hasMessageContaining("Column 'struct.inner' does not exist."); - } else { - assertThatThrownBy(() -> FlinkSchemaUtil.toResolvedSchema(icebergSchema)) - .isInstanceOf(ValidationException.class) - .hasMessageStartingWith("Invalid primary key") - .hasMessageContaining("Column 'struct.inner' does not exist."); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java deleted file mode 100644 index d99f657a11cc..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.util.List; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.Expressions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.source.BoundedTableFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkTableSink extends CatalogTestBase { - - private static final String TABLE_NAME = "test_table"; - private TableEnvironment tEnv; - private Table icebergTable; - - @Parameter(index = 2) - private FileFormat format; - - @Parameter(index = 3) - private boolean isStreamingJob; - - @Parameter(index = 4) - private boolean useV2Sink; - - @Parameters( - name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}, useV2Sink={4}") - public static List parameters() { - List parameters = Lists.newArrayList(); - for (FileFormat format : - new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { - for (Boolean isStreaming : new Boolean[] {true, false}) { - for (Object[] catalogParams : CatalogTestBase.parameters()) { - String catalogName = (String) catalogParams[0]; - Namespace baseNamespace = (Namespace) catalogParams[1]; - parameters.add( - new Object[] { - catalogName, baseNamespace, format, isStreaming, false /* don't use v2 sink */ - }); - } - } - } - - for (FileFormat format : - new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { - for (Boolean isStreaming : new Boolean[] {true, false}) { - String catalogName = "testhadoop_basenamespace"; - Namespace baseNamespace = Namespace.of("l0", "l1"); - parameters.add( - new Object[] {catalogName, baseNamespace, format, isStreaming, true /* use v2 sink */}); - } - } - - return parameters; - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); - if (isStreamingJob) { - settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(400); - env.setMaxParallelism(2); - env.setParallelism(2); - tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - } else { - settingsBuilder.inBatchMode(); - tEnv = TableEnvironment.create(settingsBuilder.build()); - } - } - } - - tEnv.getConfig() - .getConfiguration() - .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_V2_SINK, useV2Sink); - - return tEnv; - } - - @Override - @BeforeEach - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - sql( - "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", - TABLE_NAME, format.name()); - icebergTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, 
TABLE_NAME)); - } - - @Override - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); - dropDatabase(flinkDatabase, true); - BoundedTableFactory.clearDataSets(); - super.clean(); - } - - @TestTemplate - public void testInsertFromSourceTable() throws Exception { - // Register the rows into a temporary table. - getTableEnv() - .createTemporaryView( - "sourceTable", - getTableEnv() - .fromValues( - SimpleDataUtil.FLINK_SCHEMA.toSourceRowDataType(), - Expressions.row(1, "hello"), - Expressions.row(2, "world"), - Expressions.row(3, (String) null), - Expressions.row(null, "bar"))); - - // Redirect the records from source table to destination table. - sql("INSERT INTO %s SELECT id,data from sourceTable", TABLE_NAME); - - // Assert the table records as expected. - SimpleDataUtil.assertTableRecords( - icebergTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, null), - SimpleDataUtil.createRecord(null, "bar"))); - } - - @TestTemplate - public void testOverwriteTable() throws Exception { - assumeThat(isStreamingJob) - .as("Flink unbounded streaming does not support overwrite operation") - .isFalse(); - - sql("INSERT INTO %s SELECT 1, 'a'", TABLE_NAME); - SimpleDataUtil.assertTableRecords( - icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(1, "a"))); - - sql("INSERT OVERWRITE %s SELECT 2, 'b'", TABLE_NAME); - SimpleDataUtil.assertTableRecords( - icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(2, "b"))); - } - - @TestTemplate - public void testReplacePartitions() throws Exception { - assumeThat(isStreamingJob) - .as("Flink unbounded streaming does not support overwrite operation") - .isFalse(); - String tableName = "test_partition"; - sql( - "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", - tableName, format.name()); - - try { - Table partitionedTable = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); - - sql("INSERT INTO %s SELECT 1, 'a'", tableName); - sql("INSERT INTO %s SELECT 2, 'b'", tableName); - sql("INSERT INTO %s SELECT 3, 'c'", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c"))); - - sql("INSERT OVERWRITE %s SELECT 4, 'b'", tableName); - sql("INSERT OVERWRITE %s SELECT 5, 'a'", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(5, "a"), - SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c"))); - - sql("INSERT OVERWRITE %s PARTITION (data='a') SELECT 6", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(6, "a"), - SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c"))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @TestTemplate - public void testInsertIntoPartition() throws Exception { - String tableName = "test_insert_into_partition"; - sql( - "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", - tableName, format.name()); - - try { - Table partitionedTable = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); - - // Full partition. 
- sql("INSERT INTO %s PARTITION (data='a') SELECT 1", tableName); - sql("INSERT INTO %s PARTITION (data='a') SELECT 2", tableName); - sql("INSERT INTO %s PARTITION (data='b') SELECT 3", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b"))); - - // Partial partition. - sql("INSERT INTO %s SELECT 4, 'c'", tableName); - sql("INSERT INTO %s SELECT 5, 'd'", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b"), - SimpleDataUtil.createRecord(4, "c"), - SimpleDataUtil.createRecord(5, "d"))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java deleted file mode 100644 index 03d96ac2c573..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkCompaction.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; -import org.apache.flink.table.types.DataType; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestReader; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.source.BoundedTableFactory; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkTableSinkCompaction extends CatalogTestBase { - - private static final TypeInformation ROW_TYPE_INFO = - new RowTypeInfo( - SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().stream() - .map(ExternalTypeInfo::of) - .toArray(TypeInformation[]::new)); - - private static final DataFormatConverters.RowConverter CONVERTER = - new DataFormatConverters.RowConverter( - SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().toArray(DataType[]::new)); - - private static final String TABLE_NAME = "test_table"; - private StreamTableEnvironment tEnv; - private StreamExecutionEnvironment env; - private Table icebergTable; - private static final String TABLE_PROPERTIES = - "'flink-maintenance.lock.type'='jdbc'," - + "'flink-maintenance.lock.jdbc.uri'='jdbc:sqlite:file::memory:?ic'," - + "'flink-maintenance.lock.jdbc.init-lock-table'='true'," - + "'flink-maintenance.rewrite.rewrite-all'='true'," - + "'flink-maintenance.rewrite.schedule.data-file-size'='1'," - + "'flink-maintenance.lock-check-delay-seconds'='60'"; - - @Parameter(index = 2) - private boolean userSqlHint; - - @Parameters(name = "catalogName={0}, baseNamespace={1}, userSqlHint={2}") - public static List parameters() { - return Arrays.asList( - new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1"), true}, - new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1"), false}); - } - - @Override - protected StreamTableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); - settingsBuilder.inStreamingMode(); - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(100); - tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - } - } - - tEnv.getConfig() - .getConfiguration() - 
.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_V2_SINK, true) - .set(FlinkWriteOptions.COMPACTION_ENABLE, true); - - return tEnv; - } - - @Override - @BeforeEach - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - if (userSqlHint) { - sql("CREATE TABLE %s (id int, data varchar)", TABLE_NAME); - } else { - sql("CREATE TABLE %s (id int, data varchar) with (%s)", TABLE_NAME, TABLE_PROPERTIES); - } - - icebergTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - } - - @Override - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); - dropDatabase(flinkDatabase, true); - BoundedTableFactory.clearDataSets(); - super.clean(); - } - - @TestTemplate - public void testSQLCompactionE2e() throws Exception { - List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); - DataStream dataStream = - env.addSource(new BoundedTestSource<>(rows.toArray(new Row[0])), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); - - getTableEnv().createTemporaryView("sourceTable", dataStream); - - // Redirect the records from source table to destination table. - if (userSqlHint) { - sql( - "INSERT INTO %s /*+ OPTIONS(%s) */ SELECT id,data from sourceTable", - TABLE_NAME, TABLE_PROPERTIES); - } else { - sql("INSERT INTO %s SELECT id,data from sourceTable", TABLE_NAME); - } - - // Assert the table records as expected. - SimpleDataUtil.assertTableRecords( - icebergTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, "foo"))); - - // check the data file count after compact - List afterCompactDataFiles = - getDataFiles(icebergTable.currentSnapshot(), icebergTable); - assertThat(afterCompactDataFiles).hasSize(1); - - // check the data file count before compact - List preCompactDataFiles = - getDataFiles( - icebergTable.snapshot(icebergTable.currentSnapshot().parentId()), icebergTable); - assertThat(preCompactDataFiles).hasSize(3); - } - - private List getDataFiles(Snapshot snapshot, Table table) throws IOException { - List dataFiles = Lists.newArrayList(); - for (ManifestFile dataManifest : snapshot.dataManifests(table.io())) { - try (ManifestReader reader = ManifestFiles.read(dataManifest, table.io())) { - reader.iterator().forEachRemaining(dataFiles::add); - } - } - - return dataFiles; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java deleted file mode 100644 index 3afabf6e0795..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java +++ /dev/null @@ -1,388 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.flink.FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.File; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.flink.api.dag.Transformation; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.sink.v2.DiscardingSink; -import org.apache.flink.streaming.api.transformations.SinkTransformation; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.api.internal.TableEnvironmentImpl; -import org.apache.flink.table.operations.ModifyOperation; -import org.apache.flink.table.planner.delegation.PlannerBase; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.sink.IcebergSink; -import org.apache.iceberg.flink.source.BoundedTableFactory; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -/** - * This class tests the more extended features of Flink sink. Extract them separately since it is - * unnecessary to test all the parameters combinations in {@link TestFlinkTableSink}, like catalog - * types, namespaces, file format, streaming/batch. Those combinations explode exponentially. Each - * test method in {@link TestFlinkTableSink} runs 21 combinations, which are expensive and slow. 
- */ -@ExtendWith(ParameterizedTestExtension.class) -public class TestFlinkTableSinkExtended extends SqlBase { - protected static final String CATALOG = "testhadoop"; - protected static final String DATABASE = "db"; - protected static final String TABLE = "tbl"; - - @RegisterExtension - public static MiniClusterExtension miniClusterResource = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - private static final String SOURCE_TABLE = "default_catalog.default_database.bounded_source"; - private static final String FLINK_DATABASE = CATALOG + "." + DATABASE; - private static final Namespace ICEBERG_NAMESPACE = Namespace.of(new String[] {DATABASE}); - - @TempDir protected File warehouseRoot; - - protected HadoopCatalog catalog = null; - - private TableEnvironment tEnv; - - @Parameter(index = 0) - protected boolean isStreamingJob; - - @Parameter(index = 1) - protected Boolean useV2Sink; - - @Parameters(name = "isStreamingJob={0}, useV2Sink={1}") - protected static List parameters() { - return Arrays.asList( - new Object[] {true, false}, - new Object[] {false, false}, - new Object[] {true, true}, - new Object[] {false, true}, - new Object[] {true, null}); - } - - protected synchronized TableEnvironment getTableEnv() { - if (tEnv == null) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); - if (isStreamingJob) { - settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(400); - env.setMaxParallelism(2); - env.setParallelism(2); - tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - } else { - settingsBuilder.inBatchMode(); - tEnv = TableEnvironment.create(settingsBuilder.build()); - } - } - - if (useV2Sink != null) { - tEnv.getConfig() - .getConfiguration() - .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_V2_SINK, useV2Sink); - } - - return tEnv; - } - - @BeforeEach - public void before() { - String warehouseLocation = "file:" + warehouseRoot.getPath(); - this.catalog = new HadoopCatalog(new Configuration(), warehouseLocation); - Map config = Maps.newHashMap(); - config.put("type", "iceberg"); - config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HADOOP); - config.put(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation); - sql("CREATE CATALOG %s WITH %s", CATALOG, toWithClause(config)); - - sql("CREATE DATABASE %s", FLINK_DATABASE); - sql("USE CATALOG %s", CATALOG); - sql("USE %s", DATABASE); - sql( - "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", - TABLE, FileFormat.PARQUET.name()); - } - - @AfterEach - public void clean() throws Exception { - sql("DROP TABLE IF EXISTS %s.%s", FLINK_DATABASE, TABLE); - dropDatabase(FLINK_DATABASE, true); - BoundedTableFactory.clearDataSets(); - - dropCatalog(CATALOG, true); - catalog.close(); - } - - @TestTemplate - public void testUsedFlinkSinkInterface() { - String dataId = BoundedTableFactory.registerDataSet(Collections.emptyList()); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - - PlannerBase planner = (PlannerBase) ((TableEnvironmentImpl) getTableEnv()).getPlanner(); - String insertSQL = String.format("INSERT INTO %s SELECT * FROM %s", TABLE, SOURCE_TABLE); - ModifyOperation operation = (ModifyOperation) planner.getParser().parse(insertSQL).get(0); - Transformation 
transformation = - planner.translate(Collections.singletonList(operation)).get(0); - assertThat(transformation).as("Should use SinkV2 API").isInstanceOf(SinkTransformation.class); - SinkTransformation sinkTransformation = (SinkTransformation) transformation; - if (useV2Sink != null && useV2Sink) { - assertThat(sinkTransformation.getSink()) - .as("Should use SinkV2 API based implementation") - .isInstanceOf(IcebergSink.class); - } else { - assertThat(sinkTransformation.getSink()) - .as("Should use custom chain of StreamOperators terminated by DiscardingSink") - .isInstanceOf(DiscardingSink.class); - } - } - - @TestTemplate - public void testWriteParallelism() { - List dataSet = - IntStream.range(1, 1000) - .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) - .flatMap(List::stream) - .collect(Collectors.toList()); - String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - - PlannerBase planner = (PlannerBase) ((TableEnvironmentImpl) getTableEnv()).getPlanner(); - String insertSQL = - String.format( - "INSERT INTO %s /*+ OPTIONS('write-parallelism'='1') */ SELECT * FROM %s", - TABLE, SOURCE_TABLE); - ModifyOperation operation = (ModifyOperation) planner.getParser().parse(insertSQL).get(0); - Transformation sink = planner.translate(Collections.singletonList(operation)).get(0); - if (useV2Sink != null && useV2Sink) { - assertThat(sink.getParallelism()).as("Should have the expected 1 parallelism.").isEqualTo(1); - Transformation writerInput = sink.getInputs().get(0); - assertThat(writerInput.getParallelism()) - .as("Should have the expected parallelism.") - .isEqualTo(isStreamingJob ? 2 : 4); - } else { - Transformation committer = sink.getInputs().get(0); - Transformation writer = committer.getInputs().get(0); - - assertThat(writer.getParallelism()) - .as("Should have the expected 1 parallelism.") - .isEqualTo(1); - Transformation writerInput = writer.getInputs().get(0); - assertThat(writerInput.getParallelism()) - .as("Should have the expected parallelism.") - .isEqualTo(isStreamingJob ? 2 : 4); - } - } - - @TestTemplate - public void testHashDistributeMode() throws Exception { - // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. - List dataSet = - IntStream.range(1, 1000) - .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) - .flatMap(List::stream) - .collect(Collectors.toList()); - String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - - assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)) - .as("Should have the expected rows in source table.") - .containsExactlyInAnyOrderElementsOf(dataSet); - - Map tableProps = - ImmutableMap.of( - "write.format.default", - FileFormat.PARQUET.name(), - TableProperties.WRITE_DISTRIBUTION_MODE, - DistributionMode.HASH.modeName()); - - String tableName = "test_hash_distribution_mode"; - sql( - "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", - tableName, toWithClause(tableProps)); - - try { - // Insert data set. 
- sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); - - assertThat(sql("SELECT * FROM %s", tableName)) - .as("Should have the expected rows in sink table.") - .containsExactlyInAnyOrderElementsOf(dataSet); - - // Sometimes we will have more than one checkpoint if we pass the auto checkpoint interval, - // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per - // partition. - Table table = catalog.loadTable(TableIdentifier.of(ICEBERG_NAMESPACE, tableName)); - Map> snapshotToDataFiles = SimpleDataUtil.snapshotToDataFiles(table); - for (List dataFiles : snapshotToDataFiles.values()) { - if (dataFiles.isEmpty()) { - continue; - } - - assertThat( - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "aaa"))) - .hasSize(1); - assertThat( - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "bbb"))) - .hasSize(1); - assertThat( - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "ccc"))) - .hasSize(1); - } - } finally { - sql("DROP TABLE IF EXISTS %s.%s", FLINK_DATABASE, tableName); - } - } - - @TestTemplate - public void testRangeDistributionPartitionColumn() { - // Range partitioner currently only works with streaming writes (with checkpoints) - assumeThat(isStreamingJob).isTrue(); - - // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. - List> rowsPerCheckpoint = - IntStream.range(1, 6) - .mapToObj( - checkpointId -> { - List charRows = Lists.newArrayList(); - // emit 26x10 rows for each checkpoint cycle - for (int i = 0; i < 10; ++i) { - for (char c = 'a'; c <= 'z'; c++) { - charRows.add(Row.of(c - 'a', String.valueOf(c))); - } - } - return charRows; - }) - .collect(Collectors.toList()); - List flattenedRows = - rowsPerCheckpoint.stream().flatMap(List::stream).collect(Collectors.toList()); - - String dataId = BoundedTableFactory.registerDataSet(rowsPerCheckpoint); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - - assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)) - .as("Should have the expected rows in source table.") - .containsExactlyInAnyOrderElementsOf(flattenedRows); - - Map tableProps = - ImmutableMap.of( - "write.format.default", - FileFormat.PARQUET.name(), - TableProperties.WRITE_DISTRIBUTION_MODE, - DistributionMode.RANGE.modeName()); - - String tableName = "test_hash_distribution_mode"; - sql( - "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", - tableName, toWithClause(tableProps)); - - try { - // Insert data set. - sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); - - assertThat(sql("SELECT * FROM %s", tableName)) - .as("Should have the expected rows in sink table.") - .containsExactlyInAnyOrderElementsOf(flattenedRows); - - Table table = catalog.loadTable(TableIdentifier.of(ICEBERG_NAMESPACE, tableName)); - // ordered in reverse timeline from the newest snapshot to the oldest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Sometimes we will have more checkpoints than the bounded source if we pass the - // auto checkpoint interval. Thus producing multiple snapshots. 
- assertThat(snapshots).hasSizeGreaterThanOrEqualTo(5); - - // It takes 2 checkpoint cycle for statistics collection and application - // of the globally aggregated statistics in the range partitioner. - // The last two checkpoints should have range shuffle applied - List rangePartitionedCycles = - snapshots.subList(snapshots.size() - 2, snapshots.size()); - - for (Snapshot snapshot : rangePartitionedCycles) { - List addedDataFiles = - Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); - // range partition results in each partition only assigned to one writer task - // maybe less than 26 partitions as BoundedSource doesn't always precisely - // control the checkpoint boundary. - // It is hard to precisely control the test condition in SQL tests. - // Here only minimal safe assertions are applied to avoid flakiness. - // If there are no shuffling, the number of data files could be as high as - // 26 * 4 as the default parallelism is set to 4 for the mini cluster. - assertThat(addedDataFiles).hasSizeLessThanOrEqualTo(26); - } - } finally { - sql("DROP TABLE IF EXISTS %s.%s", FLINK_DATABASE, tableName); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java deleted file mode 100644 index c5a7ec4beec6..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.time.LocalDate; -import java.util.List; -import java.util.Map; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.types.Row; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.Timeout; - -@Timeout(60) -public class TestFlinkUpsert extends CatalogTestBase { - - @Parameter(index = 2) - private FileFormat format; - - @Parameter(index = 3) - private boolean isStreamingJob; - - private final Map tableUpsertProps = Maps.newHashMap(); - private TableEnvironment tEnv; - - @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") - public static List parameters() { - List parameters = Lists.newArrayList(); - for (FileFormat format : - new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO, FileFormat.ORC}) { - for (Boolean isStreaming : new Boolean[] {true, false}) { - // Only test with one catalog as this is a file operation concern. - // FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop - // catalog. 
- String catalogName = "testhadoop"; - Namespace baseNamespace = Namespace.of("default"); - parameters.add(new Object[] {catalogName, baseNamespace, format, isStreaming}); - } - } - return parameters; - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); - if (isStreamingJob) { - settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(400); - env.setMaxParallelism(2); - env.setParallelism(2); - tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - } else { - settingsBuilder.inBatchMode(); - tEnv = TableEnvironment.create(settingsBuilder.build()); - } - } - } - return tEnv; - } - - @Override - @BeforeEach - public void before() { - super.before(); - sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - tableUpsertProps.put(TableProperties.FORMAT_VERSION, "2"); - tableUpsertProps.put(TableProperties.UPSERT_ENABLED, "true"); - tableUpsertProps.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - } - - @Override - @AfterEach - public void clean() { - dropDatabase(flinkDatabase, true); - super.clean(); - } - - @TestTemplate - public void testUpsertAndQuery() { - String tableName = "test_upsert_query"; - LocalDate dt20220301 = LocalDate.of(2022, 3, 1); - LocalDate dt20220302 = LocalDate.of(2022, 3, 2); - - sql( - "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, dt DATE, PRIMARY KEY(id,dt) NOT ENFORCED) " - + "PARTITIONED BY (dt) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - try { - sql( - "INSERT INTO %s VALUES " - + "(1, 'Bill', DATE '2022-03-01')," - + "(1, 'Jane', DATE '2022-03-01')," - + "(2, 'Jane', DATE '2022-03-01')", - tableName); - - sql( - "INSERT INTO %s VALUES " - + "(2, 'Bill', DATE '2022-03-01')," - + "(1, 'Jane', DATE '2022-03-02')," - + "(2, 'Jane', DATE '2022-03-02')", - tableName); - - List rowsOn20220301 = - Lists.newArrayList(Row.of(1, "Jane", dt20220301), Row.of(2, "Bill", dt20220301)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); - - List rowsOn20220302 = - Lists.newArrayList(Row.of(1, "Jane", dt20220302), Row.of(2, "Jane", dt20220302)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Iterables.concat(rowsOn20220301, rowsOn20220302))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @TestTemplate - public void testUpsertOptions() { - String tableName = "test_upsert_options"; - LocalDate dt20220301 = LocalDate.of(2022, 3, 1); - LocalDate dt20220302 = LocalDate.of(2022, 3, 2); - - Map optionsUpsertProps = Maps.newHashMap(tableUpsertProps); - optionsUpsertProps.remove(TableProperties.UPSERT_ENABLED); - sql( - "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, dt DATE, PRIMARY KEY(id,dt) NOT ENFORCED) " - + "PARTITIONED BY (dt) WITH %s", - tableName, toWithClause(optionsUpsertProps)); - - try { - sql( - "INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ VALUES " - + "(1, 'Bill', DATE '2022-03-01')," - + "(1, 'Jane', DATE '2022-03-01')," - + "(2, 'Jane', DATE '2022-03-01')", - tableName); - - sql( - "INSERT INTO %s /*+ 
OPTIONS('upsert-enabled'='true')*/ VALUES " - + "(2, 'Bill', DATE '2022-03-01')," - + "(1, 'Jane', DATE '2022-03-02')," - + "(2, 'Jane', DATE '2022-03-02')", - tableName); - - List rowsOn20220301 = - Lists.newArrayList(Row.of(1, "Jane", dt20220301), Row.of(2, "Bill", dt20220301)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); - - List rowsOn20220302 = - Lists.newArrayList(Row.of(1, "Jane", dt20220302), Row.of(2, "Jane", dt20220302)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Iterables.concat(rowsOn20220301, rowsOn20220302))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @TestTemplate - public void testPrimaryKeyEqualToPartitionKey() { - // This is an SQL based reproduction of TestFlinkIcebergSinkV2#testUpsertOnDataKey - String tableName = "upsert_on_id_key"; - try { - sql( - "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, PRIMARY KEY(id) NOT ENFORCED) " - + "PARTITIONED BY (id) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - sql("INSERT INTO %s VALUES " + "(1, 'Bill')," + "(1, 'Jane')," + "(2, 'Bill')", tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(1, "Jane"), Row.of(2, "Bill"))); - - sql("INSERT INTO %s VALUES " + "(1, 'Bill')," + "(2, 'Jane')", tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(1, "Bill"), Row.of(2, "Jane"))); - - sql("INSERT INTO %s VALUES " + "(3, 'Bill')," + "(4, 'Jane')", tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList( - Row.of(1, "Bill"), Row.of(2, "Jane"), Row.of(3, "Bill"), Row.of(4, "Jane"))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @TestTemplate - public void testPrimaryKeyFieldsAtBeginningOfSchema() { - String tableName = "upsert_on_pk_at_schema_start"; - LocalDate dt = LocalDate.of(2022, 3, 1); - try { - sql( - "CREATE TABLE %s(id INT, dt DATE NOT NULL, name STRING NOT NULL, PRIMARY KEY(id,dt) NOT ENFORCED) " - + "PARTITIONED BY (dt) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - sql( - "INSERT INTO %s VALUES " - + "(1, DATE '2022-03-01', 'Andy')," - + "(1, DATE '2022-03-01', 'Bill')," - + "(2, DATE '2022-03-01', 'Jane')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(1, dt, "Bill"), Row.of(2, dt, "Jane"))); - - sql( - "INSERT INTO %s VALUES " - + "(1, DATE '2022-03-01', 'Jane')," - + "(2, DATE '2022-03-01', 'Bill')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(1, dt, "Jane"), Row.of(2, dt, "Bill"))); - - sql( - "INSERT INTO %s VALUES " - + "(3, DATE '2022-03-01', 'Duke')," - + "(4, DATE '2022-03-01', 'Leon')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList( - Row.of(1, dt, "Jane"), - Row.of(2, dt, "Bill"), - Row.of(3, dt, "Duke"), - Row.of(4, dt, "Leon"))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @TestTemplate - public void testPrimaryKeyFieldsAtEndOfTableSchema() { - // This is the same test case as testPrimaryKeyFieldsAtBeginningOfSchema, but the primary key - // fields - // are located at the end of the flink schema. 
- String tableName = "upsert_on_pk_at_schema_end"; - LocalDate dt = LocalDate.of(2022, 3, 1); - try { - sql( - "CREATE TABLE %s(name STRING NOT NULL, id INT, dt DATE NOT NULL, PRIMARY KEY(id,dt) NOT ENFORCED) " - + "PARTITIONED BY (dt) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - sql( - "INSERT INTO %s VALUES " - + "('Andy', 1, DATE '2022-03-01')," - + "('Bill', 1, DATE '2022-03-01')," - + "('Jane', 2, DATE '2022-03-01')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of("Bill", 1, dt), Row.of("Jane", 2, dt))); - - sql( - "INSERT INTO %s VALUES " - + "('Jane', 1, DATE '2022-03-01')," - + "('Bill', 2, DATE '2022-03-01')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of("Jane", 1, dt), Row.of("Bill", 2, dt))); - - sql( - "INSERT INTO %s VALUES " - + "('Duke', 3, DATE '2022-03-01')," - + "('Leon', 4, DATE '2022-03-01')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList( - Row.of("Jane", 1, dt), - Row.of("Bill", 2, dt), - Row.of("Duke", 3, dt), - Row.of("Leon", 4, dt))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java deleted file mode 100644 index 26aa9d2b4c58..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java +++ /dev/null @@ -1,669 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.OffsetDateTime; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.function.Consumer; -import java.util.stream.Collectors; -import org.apache.avro.generic.GenericData; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.data.conversion.DataStructureConverter; -import org.apache.flink.table.data.conversion.DataStructureConverters; -import org.apache.flink.table.runtime.typeutils.InternalSerializers; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.flink.types.Row; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.data.GenericDataUtil; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.data.RowDataUtil; -import org.apache.iceberg.flink.source.FlinkInputFormat; -import org.apache.iceberg.flink.source.FlinkInputSplit; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Streams; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; - -public class TestHelpers { - private TestHelpers() {} - - public static T roundTripKryoSerialize(Class clazz, T table) throws IOException { - KryoSerializer kryo = new KryoSerializer<>(clazz, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - kryo.serialize(table, outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - return kryo.deserialize(inputView); - } - - public static RowData copyRowData(RowData from, RowType rowType) { - TypeSerializer[] fieldSerializers = - rowType.getChildren().stream() - .map((LogicalType type) -> InternalSerializers.create(type)) - .toArray(TypeSerializer[]::new); - RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; - for (int i = 0; i < rowType.getFieldCount(); ++i) { - fieldGetters[i] = FlinkRowData.createFieldGetter(rowType.getTypeAt(i), i); - } - - return RowDataUtil.clone(from, null, rowType, fieldSerializers, fieldGetters); - } - - public static void readRowData(FlinkInputFormat input, Consumer visitor) - throws IOException { - for (FlinkInputSplit s : 
input.createInputSplits(0)) { - input.open(s); - try { - while (!input.reachedEnd()) { - RowData row = input.nextRecord(null); - visitor.accept(row); - } - } finally { - input.close(); - } - } - } - - public static List readRowData(FlinkInputFormat inputFormat, RowType rowType) - throws IOException { - List results = Lists.newArrayList(); - readRowData(inputFormat, row -> results.add(copyRowData(row, rowType))); - return results; - } - - public static List readRows(FlinkInputFormat inputFormat, RowType rowType) - throws IOException { - return convertRowDataToRow(readRowData(inputFormat, rowType), rowType); - } - - public static List convertRowDataToRow(List rowDataList, RowType rowType) { - DataStructureConverter converter = - DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); - return rowDataList.stream() - .map(converter::toExternal) - .map(Row.class::cast) - .collect(Collectors.toList()); - } - - public static List convertRecordToRow(List expectedRecords, Schema schema) { - List expected = Lists.newArrayList(); - @SuppressWarnings("unchecked") - DataStructureConverter converter = - (DataStructureConverter) - DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(schema))); - expectedRecords.forEach( - r -> expected.add(converter.toExternal(RowDataConverter.convert(schema, r)))); - return expected; - } - - public static void assertRecordsWithOrder( - List results, List expectedRecords, Schema schema) { - List expected = convertRecordToRow(expectedRecords, schema); - assertRowsWithOrder(results, expected); - } - - public static void assertRecords(List results, List expectedRecords, Schema schema) { - List expected = convertRecordToRow(expectedRecords, schema); - assertRows(results, expected); - } - - public static void assertRows(List results, List expected, RowType rowType) { - assertRows(convertRowDataToRow(results, rowType), convertRowDataToRow(expected, rowType)); - } - - public static void assertRows(List results, List expected) { - assertThat(results).containsExactlyInAnyOrderElementsOf(expected); - } - - public static void assertRowsWithOrder(List results, List expected) { - assertThat(results).containsExactlyElementsOf(expected); - } - - public static void assertRowData(Schema schema, StructLike expected, RowData actual) { - assertRowData(schema.asStruct(), FlinkSchemaUtil.convert(schema), expected, actual); - } - - public static void assertRowData( - Types.StructType structType, - LogicalType rowType, - StructLike expectedRecord, - RowData actualRowData) { - if (expectedRecord == null && actualRowData == null) { - return; - } - - assertThat(expectedRecord).isNotNull(); - assertThat(actualRowData).isNotNull(); - - List types = Lists.newArrayList(); - for (Types.NestedField field : structType.fields()) { - types.add(field.type()); - } - - if (expectedRecord instanceof Record) { - Record expected = (Record) expectedRecord; - Types.StructType expectedType = expected.struct(); - int pos = 0; - for (Types.NestedField field : structType.fields()) { - Types.NestedField expectedField = expectedType.field(field.fieldId()); - LogicalType logicalType = ((RowType) rowType).getTypeAt(pos); - Object actualValue = - FlinkRowData.createFieldGetter(logicalType, pos).getFieldOrNull(actualRowData); - if (expectedField != null) { - assertEquals( - field.type(), logicalType, expected.getField(expectedField.name()), actualValue); - } else { - // convert the initial value to generic because that is the data model used to 
generate - // the expected records - assertEquals( - field.type(), - logicalType, - GenericDataUtil.internalToGeneric(field.type(), field.initialDefault()), - actualValue); - } - pos += 1; - } - - } else { - for (int i = 0; i < types.size(); i += 1) { - LogicalType logicalType = ((RowType) rowType).getTypeAt(i); - Object expected = expectedRecord.get(i, Object.class); - Object actual = - FlinkRowData.createFieldGetter(logicalType, i).getFieldOrNull(actualRowData); - assertEquals(types.get(i), logicalType, expected, actual); - } - } - } - - private static void assertEquals( - Type type, LogicalType logicalType, Object expected, Object actual) { - - if (expected == null && actual == null) { - return; - } - - assertThat(expected).isNotNull(); - assertThat(actual).isNotNull(); - - switch (type.typeId()) { - case BOOLEAN: - assertThat(actual).as("boolean value should be equal").isEqualTo(expected); - break; - case INTEGER: - assertThat(actual).as("int value should be equal").isEqualTo(expected); - break; - case LONG: - assertThat(actual).as("long value should be equal").isEqualTo(expected); - break; - case FLOAT: - assertThat(actual).as("float value should be equal").isEqualTo(expected); - break; - case DOUBLE: - assertThat(actual).as("double value should be equal").isEqualTo(expected); - break; - case STRING: - assertThat(expected).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); - assertThat(actual.toString()) - .as("string should be equal") - .isEqualTo(String.valueOf(expected)); - break; - case DATE: - assertThat(expected).as("Should expect a Date").isInstanceOf(LocalDate.class); - LocalDate date = DateTimeUtil.dateFromDays((int) actual); - assertThat(date).as("date should be equal").isEqualTo(expected); - break; - case TIME: - assertThat(expected).as("Should expect a LocalTime").isInstanceOf(LocalTime.class); - int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); - assertThat(actual).as("time millis should be equal").isEqualTo(milliseconds); - break; - case TIMESTAMP: - if (((Types.TimestampType) type).shouldAdjustToUTC()) { - assertThat(expected) - .as("Should expect a OffsetDataTime") - .isInstanceOf(OffsetDateTime.class); - OffsetDateTime ts = (OffsetDateTime) expected; - assertThat(((TimestampData) actual).toLocalDateTime()) - .as("OffsetDataTime should be equal") - .isEqualTo(ts.toLocalDateTime()); - } else { - assertThat(expected) - .as("Should expect a LocalDataTime") - .isInstanceOf(LocalDateTime.class); - LocalDateTime ts = (LocalDateTime) expected; - assertThat(((TimestampData) actual).toLocalDateTime()) - .as("LocalDataTime should be equal") - .isEqualTo(ts); - } - break; - case TIMESTAMP_NANO: - if (((Types.TimestampNanoType) type).shouldAdjustToUTC()) { - assertThat(expected) - .as("Should expect a OffsetDataTime") - .isInstanceOf(OffsetDateTime.class); - OffsetDateTime ts = (OffsetDateTime) expected; - assertThat(((TimestampData) actual).toLocalDateTime()) - .as("OffsetDataTime should be equal") - .isEqualTo(ts.toLocalDateTime()); - } else { - assertThat(expected) - .as("Should expect a LocalDataTime") - .isInstanceOf(LocalDateTime.class); - LocalDateTime ts = (LocalDateTime) expected; - assertThat(((TimestampData) actual).toLocalDateTime()) - .as("LocalDataTime should be equal") - .isEqualTo(ts); - } - break; - case BINARY: - assertThat(ByteBuffer.wrap((byte[]) actual)) - .as("Should expect a ByteBuffer") - .isInstanceOf(ByteBuffer.class) - .isEqualTo(expected); - break; - case DECIMAL: - assertThat(expected).as("Should expect a 
BigDecimal").isInstanceOf(BigDecimal.class); - BigDecimal bd = (BigDecimal) expected; - assertThat(((DecimalData) actual).toBigDecimal()) - .as("decimal value should be equal") - .isEqualTo(bd); - break; - case LIST: - assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); - Collection expectedArrayData = (Collection) expected; - ArrayData actualArrayData = (ArrayData) actual; - LogicalType elementType = ((ArrayType) logicalType).getElementType(); - assertThat(actualArrayData.size()) - .as("array length should be equal") - .isEqualTo(expectedArrayData.size()); - assertArrayValues( - type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); - break; - case MAP: - assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - assertMapValues(type.asMapType(), logicalType, (Map) expected, (MapData) actual); - break; - case STRUCT: - assertThat(expected).as("Should expect a Record").isInstanceOf(StructLike.class); - assertRowData(type.asStructType(), logicalType, (StructLike) expected, (RowData) actual); - break; - case UUID: - assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - ByteBuffer bb = ByteBuffer.wrap((byte[]) actual); - long firstLong = bb.getLong(); - long secondLong = bb.getLong(); - assertThat(new UUID(firstLong, secondLong).toString()) - .as("UUID should be equal") - .isEqualTo(expected.toString()); - break; - case FIXED: - assertThat(actual) - .as("Should expect byte[]") - .isInstanceOf(byte[].class) - .isEqualTo(expected); - break; - default: - throw new IllegalArgumentException("Not a supported type: " + type); - } - } - - public static void assertEquals(Schema schema, List records, List rows) { - Streams.forEachPair( - records.stream(), rows.stream(), (record, row) -> assertEquals(schema, record, row)); - } - - public static void assertEquals(Schema schema, GenericData.Record record, Row row) { - List fields = schema.asStruct().fields(); - assertThat(fields).hasSameSizeAs(record.getSchema().getFields()); - assertThat(fields).hasSize(row.getArity()); - - RowType rowType = FlinkSchemaUtil.convert(schema); - for (int i = 0; i < fields.size(); ++i) { - Type fieldType = fields.get(i).type(); - Object expectedValue = record.get(i); - Object actualValue = row.getField(i); - LogicalType logicalType = rowType.getTypeAt(i); - assertAvroEquals(fieldType, logicalType, expectedValue, actualValue); - } - } - - private static void assertEquals(Types.StructType struct, GenericData.Record record, Row row) { - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i).type(); - Object expectedValue = record.get(i); - Object actualValue = row.getField(i); - assertAvroEquals(fieldType, null, expectedValue, actualValue); - } - } - - private static void assertAvroEquals( - Type type, LogicalType logicalType, Object expected, Object actual) { - - if (expected == null && actual == null) { - return; - } - assertThat(expected).isNotNull(); - assertThat(actual).isNotNull(); - - switch (type.typeId()) { - case BOOLEAN: - case INTEGER: - case LONG: - case FLOAT: - case DOUBLE: - assertThat(expected) - .as("Should expect a " + type.typeId().javaClass()) - .isInstanceOf(type.typeId().javaClass()); - assertThat(actual) - .as("Should expect a " + type.typeId().javaClass()) - .isInstanceOf(type.typeId().javaClass()); - assertThat(actual).as(type.typeId() + " value should be equal").isEqualTo(expected); - break; - case STRING: - assertThat(expected).as("Should expect a 
CharSequence").isInstanceOf(CharSequence.class); - assertThat(actual).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); - assertThat(actual.toString()).as("string should be equal").isEqualTo(expected.toString()); - break; - case DATE: - assertThat(expected).as("Should expect a Date").isInstanceOf(LocalDate.class); - LocalDate date = DateTimeUtil.dateFromDays((int) actual); - assertThat(date).as("date should be equal").isEqualTo(expected); - break; - case TIME: - assertThat(expected).as("Should expect a LocalTime").isInstanceOf(LocalTime.class); - int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); - assertThat(actual).as("time millis should be equal").isEqualTo(milliseconds); - break; - case TIMESTAMP: - if (((Types.TimestampType) type).shouldAdjustToUTC()) { - assertThat(expected) - .as("Should expect a OffsetDataTime") - .isInstanceOf(OffsetDateTime.class); - OffsetDateTime ts = (OffsetDateTime) expected; - assertThat(((TimestampData) actual).toLocalDateTime()) - .as("OffsetDataTime should be equal") - .isEqualTo(ts.toLocalDateTime()); - } else { - assertThat(expected) - .as("Should expect a LocalDataTime") - .isInstanceOf(LocalDateTime.class); - LocalDateTime ts = (LocalDateTime) expected; - assertThat(((TimestampData) actual).toLocalDateTime()) - .as("LocalDataTime should be equal") - .isEqualTo(ts); - } - break; - case BINARY: - assertThat(ByteBuffer.wrap((byte[]) actual)) - .as("Should expect a ByteBuffer") - .isInstanceOf(ByteBuffer.class) - .isEqualTo(expected); - break; - case DECIMAL: - assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); - BigDecimal bd = (BigDecimal) expected; - assertThat(((DecimalData) actual).toBigDecimal()) - .as("decimal value should be equal") - .isEqualTo(bd); - break; - case LIST: - assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); - Collection expectedArrayData = (Collection) expected; - ArrayData actualArrayData; - try { - actualArrayData = (ArrayData) actual; - } catch (ClassCastException e) { - actualArrayData = new GenericArrayData((Object[]) actual); - } - LogicalType elementType = ((ArrayType) logicalType).getElementType(); - assertThat(actualArrayData.size()) - .as("array length should be equal") - .isEqualTo(expectedArrayData.size()); - assertArrayValues( - type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); - break; - case MAP: - assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - MapData actualMap; - try { - actualMap = (MapData) actual; - } catch (ClassCastException e) { - actualMap = new GenericMapData((Map) actual); - } - assertMapValues(type.asMapType(), logicalType, (Map) expected, actualMap); - break; - case STRUCT: - assertThat(expected).as("Should expect a Record").isInstanceOf(GenericData.Record.class); - assertEquals( - type.asNestedType().asStructType(), (GenericData.Record) expected, (Row) actual); - break; - case UUID: - assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - ByteBuffer bb = ByteBuffer.wrap((byte[]) actual); - long firstLong = bb.getLong(); - long secondLong = bb.getLong(); - assertThat(new UUID(firstLong, secondLong).toString()) - .as("UUID should be equal") - .isEqualTo(expected.toString()); - break; - case FIXED: - assertThat(actual) - .as("Should expect byte[]") - .isInstanceOf(byte[].class) - .isEqualTo(expected); - break; - default: - throw new IllegalArgumentException("Not a supported type: " + type); - } - } - - private 
static void assertArrayValues( - Type type, LogicalType logicalType, Collection expectedArray, ArrayData actualArray) { - List expectedElements = Lists.newArrayList(expectedArray); - for (int i = 0; i < expectedArray.size(); i += 1) { - if (expectedElements.get(i) == null) { - assertThat(actualArray.isNullAt(i)).isTrue(); - continue; - } - - Object expected = expectedElements.get(i); - - assertEquals( - type, - logicalType, - expected, - ArrayData.createElementGetter(logicalType).getElementOrNull(actualArray, i)); - } - } - - private static void assertMapValues( - Types.MapType mapType, LogicalType type, Map expected, MapData actual) { - assertThat(actual.size()).as("map size should be equal").isEqualTo(expected.size()); - - ArrayData actualKeyArrayData = actual.keyArray(); - ArrayData actualValueArrayData = actual.valueArray(); - LogicalType actualKeyType = ((MapType) type).getKeyType(); - LogicalType actualValueType = ((MapType) type).getValueType(); - Type keyType = mapType.keyType(); - Type valueType = mapType.valueType(); - - ArrayData.ElementGetter keyGetter = ArrayData.createElementGetter(actualKeyType); - ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(actualValueType); - - for (Map.Entry entry : expected.entrySet()) { - Object matchedActualKey = null; - int matchedKeyIndex = 0; - for (int i = 0; i < actual.size(); i += 1) { - try { - Object key = keyGetter.getElementOrNull(actualKeyArrayData, i); - assertEquals(keyType, actualKeyType, entry.getKey(), key); - matchedActualKey = key; - matchedKeyIndex = i; - break; - } catch (AssertionError e) { - // not found - } - } - assertThat(matchedActualKey).as("Should have a matching key").isNotNull(); - final int valueIndex = matchedKeyIndex; - assertEquals( - valueType, - actualValueType, - entry.getValue(), - valueGetter.getElementOrNull(actualValueArrayData, valueIndex)); - } - } - - public static void assertEquals(ManifestFile expected, ManifestFile actual) { - if (expected == actual) { - return; - } - assertThat(expected).isNotNull(); - assertThat(actual).isNotNull(); - assertThat(actual.path()).as("Path must match").isEqualTo(expected.path()); - assertThat(actual.length()).as("Length must match").isEqualTo(expected.length()); - assertThat(actual.partitionSpecId()) - .as("Spec id must match") - .isEqualTo(expected.partitionSpecId()); - assertThat(actual.content()).as("ManifestContent must match").isEqualTo(expected.content()); - assertThat(actual.sequenceNumber()) - .as("SequenceNumber must match") - .isEqualTo(expected.sequenceNumber()); - assertThat(actual.minSequenceNumber()) - .as("MinSequenceNumber must match") - .isEqualTo(expected.minSequenceNumber()); - assertThat(actual.snapshotId()).as("Snapshot id must match").isEqualTo(expected.snapshotId()); - assertThat(actual.hasAddedFiles()) - .as("Added files flag must match") - .isEqualTo(expected.hasAddedFiles()); - assertThat(actual.addedFilesCount()) - .as("Added files count must match") - .isEqualTo(expected.addedFilesCount()); - assertThat(actual.addedRowsCount()) - .as("Added rows count must match") - .isEqualTo(expected.addedRowsCount()); - assertThat(actual.hasExistingFiles()) - .as("Existing files flag must match") - .isEqualTo(expected.hasExistingFiles()); - assertThat(actual.existingFilesCount()) - .as("Existing files count must match") - .isEqualTo(expected.existingFilesCount()); - assertThat(actual.existingRowsCount()) - .as("Existing rows count must match") - .isEqualTo(expected.existingRowsCount()); - assertThat(actual.hasDeletedFiles()) - 
.as("Deleted files flag must match") - .isEqualTo(expected.hasDeletedFiles()); - assertThat(actual.deletedFilesCount()) - .as("Deleted files count must match") - .isEqualTo(expected.deletedFilesCount()); - assertThat(actual.deletedRowsCount()) - .as("Deleted rows count must match") - .isEqualTo(expected.deletedRowsCount()); - - List expectedSummaries = expected.partitions(); - List actualSummaries = actual.partitions(); - assertThat(actualSummaries) - .as("PartitionFieldSummary size does not match") - .hasSameSizeAs(expectedSummaries); - for (int i = 0; i < expectedSummaries.size(); i++) { - assertThat(actualSummaries.get(i).containsNull()) - .as("Null flag in partition must match") - .isEqualTo(expectedSummaries.get(i).containsNull()); - assertThat(actualSummaries.get(i).containsNaN()) - .as("NaN flag in partition must match") - .isEqualTo(expectedSummaries.get(i).containsNaN()); - assertThat(actualSummaries.get(i).lowerBound()) - .as("Lower bounds in partition must match") - .isEqualTo(expectedSummaries.get(i).lowerBound()); - assertThat(actualSummaries.get(i).upperBound()) - .as("Upper bounds in partition must match") - .isEqualTo(expectedSummaries.get(i).upperBound()); - } - } - - public static void assertEquals(ContentFile expected, ContentFile actual) { - if (expected == actual) { - return; - } - assertThat(expected).isNotNull(); - assertThat(actual).isNotNull(); - assertThat(actual.specId()).as("SpecId").isEqualTo(expected.specId()); - assertThat(actual.content()).as("Content").isEqualTo(expected.content()); - assertThat(actual.location()).as("Location").isEqualTo(expected.location()); - assertThat(actual.format()).as("Format").isEqualTo(expected.format()); - assertThat(actual.partition().size()) - .as("Partition size") - .isEqualTo(expected.partition().size()); - for (int i = 0; i < expected.partition().size(); i++) { - assertThat(actual.partition().get(i, Object.class)) - .as("Partition data at index " + i) - .isEqualTo(expected.partition().get(i, Object.class)); - } - assertThat(actual.recordCount()).as("Record count").isEqualTo(expected.recordCount()); - assertThat(actual.fileSizeInBytes()) - .as("File size in bytes") - .isEqualTo(expected.fileSizeInBytes()); - assertThat(actual.columnSizes()).as("Column sizes").isEqualTo(expected.columnSizes()); - assertThat(actual.valueCounts()).as("Value counts").isEqualTo(expected.valueCounts()); - assertThat(actual.nullValueCounts()) - .as("Null value counts") - .isEqualTo(expected.nullValueCounts()); - assertThat(actual.lowerBounds()).as("Lower bounds").isEqualTo(expected.lowerBounds()); - assertThat(actual.upperBounds()).as("Upper bounds").isEqualTo(expected.upperBounds()); - assertThat(actual.keyMetadata()).as("Key metadata").isEqualTo(expected.keyMetadata()); - assertThat(actual.splitOffsets()).as("Split offsets").isEqualTo(expected.splitOffsets()); - assertThat(actual.equalityFieldIds()) - .as("Equality field id list") - .isEqualTo(expected.equalityFieldIds()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java deleted file mode 100644 index 47f5485df879..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.nio.file.Files; -import java.util.Map; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.catalog.Catalog; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.types.Row; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.thrift.TException; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergConnector extends TestBase { - - private static final String TABLE_NAME = "test_table"; - - @Parameter(index = 0) - private String catalogName; - - @Parameter(index = 1) - private Map properties; - - @Parameter(index = 2) - private boolean isStreaming; - - private volatile TableEnvironment tEnv; - - @Parameters(name = "catalogName = {0}, properties = {1}, isStreaming = {2}") - public static Iterable parameters() { - return Lists.newArrayList( - // Create iceberg table in the hadoop catalog and default database. - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop"), - false - }, - // Create iceberg table in the hadoop catalog and not_existing_db. 
- new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db"), - false - }, - // Create iceberg table in the hive catalog and default database. - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive"), - false - }, - // Create iceberg table in the hive catalog and not_existing_db. - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db"), - false - }); - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); - if (isStreaming) { - settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(400); - env.setMaxParallelism(2); - env.setParallelism(2); - tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - } else { - settingsBuilder.inBatchMode(); - tEnv = TableEnvironment.create(settingsBuilder.build()); - } - // Set only one parallelism. - tEnv.getConfig() - .getConfiguration() - .set(CoreOptions.DEFAULT_PARALLELISM, 1) - .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - } - } - } - return tEnv; - } - - @AfterEach - public void after() throws TException { - sql("DROP TABLE IF EXISTS %s", TABLE_NAME); - - // Clean the created orphan databases and tables from hive-metastore. - if (isHiveCatalog()) { - HiveMetaStoreClient metaStoreClient = new HiveMetaStoreClient(hiveConf); - try { - metaStoreClient.dropTable(databaseName(), tableName()); - if (!isDefaultDatabaseName()) { - try { - metaStoreClient.dropDatabase(databaseName()); - } catch (Exception ignored) { - // Ignore - } - } - } finally { - metaStoreClient.close(); - } - } - } - - private void testCreateConnectorTable() { - Map tableProps = createTableProps(); - - // Create table under the flink's current database. 
- sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); - sql("INSERT INTO %s VALUES (1, 'AAA'), (2, 'BBB'), (3, 'CCC')", TABLE_NAME); - assertThat(sql("SELECT * FROM %s", TABLE_NAME)) - .containsExactlyInAnyOrder(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")); - - FlinkCatalogFactory factory = new FlinkCatalogFactory(); - Catalog flinkCatalog = factory.createCatalog(catalogName, tableProps, new Configuration()); - assertThat(flinkCatalog.databaseExists(databaseName())).isTrue(); - assertThat(flinkCatalog.tableExists(new ObjectPath(databaseName(), tableName()))).isTrue(); - - // Drop and create it again. - sql("DROP TABLE %s", TABLE_NAME); - sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); - assertThat(sql("SELECT * FROM %s", TABLE_NAME)) - .containsExactlyInAnyOrder(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")); - } - - @TestTemplate - public void testCreateTableUnderDefaultDatabase() { - testCreateConnectorTable(); - } - - @TestTemplate - public void testCatalogDatabaseConflictWithFlinkDatabase() { - sql("CREATE DATABASE IF NOT EXISTS `%s`", databaseName()); - sql("USE `%s`", databaseName()); - testCreateConnectorTable(); - // Ensure that the table was created under the specific database. - assertThatThrownBy( - () -> sql("CREATE TABLE `default_catalog`.`%s`.`%s`", databaseName(), TABLE_NAME)) - .isInstanceOf(org.apache.flink.table.api.TableException.class) - .hasMessageStartingWith("Could not execute CreateTable in path"); - } - - @TestTemplate - public void testConnectorTableInIcebergCatalog() { - // Create the catalog properties - Map catalogProps = Maps.newHashMap(); - catalogProps.put("type", "iceberg"); - if (isHiveCatalog()) { - catalogProps.put("catalog-type", "hive"); - catalogProps.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); - } else { - catalogProps.put("catalog-type", "hadoop"); - } - catalogProps.put(CatalogProperties.WAREHOUSE_LOCATION, createWarehouse()); - - // Create the table properties - Map tableProps = createTableProps(); - - // Create a connector table in an iceberg catalog. 
- sql("CREATE CATALOG `test_catalog` WITH %s", toWithClause(catalogProps)); - try { - assertThatThrownBy( - () -> - sql( - "CREATE TABLE `test_catalog`.`%s`.`%s` (id BIGINT, data STRING) WITH %s", - FlinkCatalogFactory.DEFAULT_DATABASE_NAME, - TABLE_NAME, - toWithClause(tableProps))) - .cause() - .isInstanceOf(IllegalArgumentException.class) - .hasMessage( - "Cannot create the table with 'connector'='iceberg' table property in an iceberg catalog, " - + "Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " - + "create table without 'connector'='iceberg' related properties in an iceberg table."); - } finally { - sql("DROP CATALOG IF EXISTS `test_catalog`"); - } - } - - private Map createTableProps() { - Map tableProps = Maps.newHashMap(properties); - tableProps.put("catalog-name", catalogName); - tableProps.put(CatalogProperties.WAREHOUSE_LOCATION, createWarehouse()); - if (isHiveCatalog()) { - tableProps.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); - } - return tableProps; - } - - private boolean isHiveCatalog() { - return "testhive".equalsIgnoreCase(catalogName); - } - - private boolean isDefaultDatabaseName() { - return FlinkCatalogFactory.DEFAULT_DATABASE_NAME.equalsIgnoreCase(databaseName()); - } - - private String tableName() { - return properties.getOrDefault("catalog-table", TABLE_NAME); - } - - private String databaseName() { - return properties.getOrDefault("catalog-database", "default_database"); - } - - private String createWarehouse() { - try { - return String.format( - "file://%s", - Files.createTempDirectory(temporaryDirectory, "junit").toFile().getAbsolutePath()); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java deleted file mode 100644 index 8f1f129e183b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.file.Path; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.GenericManifestFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestManifestFileSerialization { - - private static final Schema SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("double").build(); - - private static final DataFile FILE_A = - DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics( - new Metrics( - 5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = - DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics( - new Metrics( - 1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); - - private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - - @TempDir private Path temp; - - @Test - public void testKryoSerialization() throws IOException { - KryoSerializer kryo = - new KryoSerializer<>(ManifestFile.class, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - - ManifestFile manifest = writeManifest(FILE_A, FILE_B); - - kryo.serialize(manifest, outputView); - kryo.serialize(manifest.copy(), outputView); - 
kryo.serialize(GenericManifestFile.copyOf(manifest).build(), outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - ManifestFile m1 = kryo.deserialize(inputView); - ManifestFile m2 = kryo.deserialize(inputView); - ManifestFile m3 = kryo.deserialize(inputView); - - TestHelpers.assertEquals(manifest, m1); - TestHelpers.assertEquals(manifest, m2); - TestHelpers.assertEquals(manifest, m3); - } - - @Test - public void testJavaSerialization() throws Exception { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - - ManifestFile manifest = writeManifest(FILE_A, FILE_B); - - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(manifest); - out.writeObject(manifest.copy()); - out.writeObject(GenericManifestFile.copyOf(manifest).build()); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - for (int i = 0; i < 3; i += 1) { - Object obj = in.readObject(); - assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); - TestHelpers.assertEquals(manifest, (ManifestFile) obj); - } - } - } - - private ManifestFile writeManifest(DataFile... files) throws IOException { - File manifestFile = File.createTempFile("input", "m0.avro", temp.toFile()); - assertThat(manifestFile.delete()).isTrue(); - OutputFile outputFile = FILE_IO.newOutputFile(manifestFile.getCanonicalPath()); - - ManifestWriter writer = ManifestFiles.write(SPEC, outputFile); - try { - for (DataFile file : files) { - writer.add(file); - } - } finally { - writer.close(); - } - - return writer.toManifestFile(); - } - - private static ByteBuffer longToBuffer(long value) { - return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java deleted file mode 100644 index 0e7635a33e87..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Iterator; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.RecordWrapperTestBase; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.data.RandomRowData; -import org.apache.iceberg.util.StructLikeWrapper; - -public class TestRowDataWrapper extends RecordWrapperTestBase { - - /** - * Flink's time type has been truncated to millis seconds, so we need a customized assert method - * to check the values. - */ - @Override - public void testTime() { - generateAndValidate( - new Schema(TIME.fields()), - (message, expectedWrapper, actualWrapper) -> { - for (int pos = 0; pos < TIME.fields().size(); pos++) { - Object expected = expectedWrapper.get().get(pos, Object.class); - Object actual = actualWrapper.get().get(pos, Object.class); - if (expected == actual) { - return; - } - - assertThat(actual).isNotNull(); - assertThat(expected).isNotNull(); - - int expectedMilliseconds = (int) ((long) expected / 1000_000); - int actualMilliseconds = (int) ((long) actual / 1000_000); - assertThat(actualMilliseconds).as(message).isEqualTo(expectedMilliseconds); - } - }); - } - - @Override - protected void generateAndValidate( - Schema schema, RecordWrapperTestBase.AssertMethod assertMethod) { - int numRecords = 100; - Iterable recordList = RandomGenericData.generate(schema, numRecords, 101L); - Iterable rowDataList = RandomRowData.generate(schema, numRecords, 101L); - - InternalRecordWrapper recordWrapper = new InternalRecordWrapper(schema.asStruct()); - RowDataWrapper rowDataWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - - Iterator actual = recordList.iterator(); - Iterator expected = rowDataList.iterator(); - - StructLikeWrapper actualWrapper = StructLikeWrapper.forType(schema.asStruct()); - StructLikeWrapper expectedWrapper = StructLikeWrapper.forType(schema.asStruct()); - for (int i = 0; i < numRecords; i++) { - assertThat(actual).hasNext(); - assertThat(expected).hasNext(); - - StructLike recordStructLike = recordWrapper.wrap(actual.next()); - StructLike rowDataStructLike = rowDataWrapper.wrap(expected.next()); - - assertMethod.assertEquals( - "Should have expected StructLike values", - expectedWrapper.set(rowDataStructLike), - actualWrapper.set(recordStructLike)); - } - - assertThat(actual).isExhausted(); - assertThat(expected).isExhausted(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java deleted file mode 100644 index a7c58e551112..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.File; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestTables; - -public class TestTableLoader implements TableLoader { - private final File dir; - - public static TableLoader of(String dir) { - return new TestTableLoader(dir); - } - - public TestTableLoader(String dir) { - this.dir = new File(dir); - } - - @Override - public void open() {} - - @Override - public boolean isOpen() { - return true; - } - - @Override - public Table loadTable() { - return TestTables.load(dir, "test"); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public TableLoader clone() { - return new TestTableLoader(dir.getAbsolutePath()); - } - - @Override - public void close() {} -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java deleted file mode 100644 index 7f0e7acaa822..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.flink.TestHelpers.roundTripKryoSerialize; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.util.Map; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.MetadataTableUtils; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.Transaction; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestTableSerialization { - private static final HadoopTables TABLES = new HadoopTables(); - - private static final Schema SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("date").build(); - - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - - @TempDir private Path temp; - private Table table; - - @BeforeEach - public void initTable() throws IOException { - Map props = ImmutableMap.of("k1", "v1", "k2", "v2"); - - File tableLocation = File.createTempFile("junit", null, temp.toFile()); - assertThat(tableLocation.delete()).isTrue(); - - this.table = TABLES.create(SCHEMA, SPEC, SORT_ORDER, props, tableLocation.toString()); - } - - @Test - public void testSerializableTableKryoSerialization() throws IOException { - SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); - TestHelpers.assertSerializedAndLoadedMetadata( - table, roundTripKryoSerialize(SerializableTable.class, serializableTable)); - } - - @Test - public void testSerializableMetadataTableKryoSerialization() throws IOException { - for (MetadataTableType type : MetadataTableType.values()) { - TableOperations ops = ((HasTableOperations) table).operations(); - Table metadataTable = - MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); - SerializableTable serializableMetadataTable = - (SerializableTable) SerializableTable.copyOf(metadataTable); - - TestHelpers.assertSerializedAndLoadedMetadata( - metadataTable, - roundTripKryoSerialize(SerializableTable.class, serializableMetadataTable)); - } - } - - @Test - public void testSerializableTransactionTableKryoSerialization() throws IOException { - Transaction txn = table.newTransaction(); - - txn.updateProperties().set("k1", "v1").commit(); - - Table txnTable = txn.table(); - SerializableTable serializableTxnTable = (SerializableTable) SerializableTable.copyOf(txnTable); - - TestHelpers.assertSerializedMetadata( - txnTable, roundTripKryoSerialize(SerializableTable.class, serializableTxnTable)); - } -} diff --git 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java deleted file mode 100644 index b9c8ebbf179b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.actions; - -import static org.apache.iceberg.flink.SimpleDataUtil.RECORD; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.util.Collection; -import java.util.List; -import java.util.Set; -import java.util.UUID; -import java.util.stream.Collectors; -import org.apache.commons.lang3.StringUtils; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.types.Row; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileContent; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Files; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.actions.RewriteDataFilesActionResult; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.CatalogTestBase; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.io.TempDir; - -public class 
TestRewriteDataFilesAction extends CatalogTestBase { - - private static final String TABLE_NAME_UNPARTITIONED = "test_table_unpartitioned"; - private static final String TABLE_NAME_PARTITIONED = "test_table_partitioned"; - private static final String TABLE_NAME_WITH_PK = "test_table_with_pk"; - - @Parameter(index = 2) - private FileFormat format; - - @Parameter(index = 3) - private int formatVersion; - - private Table icebergTableUnPartitioned; - private Table icebergTablePartitioned; - private Table icebergTableWithPk; - - @Override - protected TableEnvironment getTableEnv() { - super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); - return super.getTableEnv(); - } - - @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, formatVersion={3}") - public static List parameters() { - List parameters = Lists.newArrayList(); - for (FileFormat format : - new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}) { - for (Object[] catalogParams : CatalogTestBase.parameters()) { - for (int version : TestHelpers.V2_AND_ABOVE) { - String catalogName = (String) catalogParams[0]; - Namespace baseNamespace = (Namespace) catalogParams[1]; - parameters.add(new Object[] {catalogName, baseNamespace, format, version}); - } - } - } - return parameters; - } - - private @TempDir Path temp; - - @Override - @BeforeEach - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - sql( - "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s', '%s'='%s')", - TABLE_NAME_UNPARTITIONED, format.name(), TableProperties.FORMAT_VERSION, formatVersion); - icebergTableUnPartitioned = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_UNPARTITIONED)); - - sql( - "CREATE TABLE %s (id int, data varchar,spec varchar) " - + " PARTITIONED BY (data,spec) with ('write.format.default'='%s', '%s'='%s')", - TABLE_NAME_PARTITIONED, format.name(), TableProperties.FORMAT_VERSION, formatVersion); - icebergTablePartitioned = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_PARTITIONED)); - - sql( - "CREATE TABLE %s (id int, data varchar, PRIMARY KEY(`id`) NOT ENFORCED) with ('write.format.default'='%s', '%s'='%s')", - TABLE_NAME_WITH_PK, format.name(), TableProperties.FORMAT_VERSION, formatVersion); - icebergTableWithPk = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); - } - - @Override - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_UNPARTITIONED); - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_PARTITIONED); - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_WITH_PK); - dropDatabase(flinkDatabase, true); - super.clean(); - } - - @TestTemplate - public void testFailureOnV3Table() { - // Flink does not support compaction on row lineage enabled tables (V3+) - assumeThat(formatVersion).isGreaterThanOrEqualTo(3); - - assertThatThrownBy( - () -> Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute()) - .hasMessageContaining( - "Flink does not support compaction on row lineage enabled tables (V3+)") - .isInstanceOf(IllegalArgumentException.class); - } - - @TestTemplate - public void testRewriteDataFilesEmptyTable() throws Exception { - // Flink does not support compaction on row lineage enabled tables (V3+) - assumeThat(formatVersion).isLessThan(3); - - 
assertThat(icebergTableUnPartitioned.currentSnapshot()).isNull(); - Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); - assertThat(icebergTableUnPartitioned.currentSnapshot()).isNull(); - } - - @TestTemplate - public void testRewriteDataFilesUnpartitionedTable() throws Exception { - // Flink does not support compaction on row lineage enabled tables (V3+) - assumeThat(formatVersion).isLessThan(3); - - sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_UNPARTITIONED); - sql("INSERT INTO %s SELECT 2, 'world'", TABLE_NAME_UNPARTITIONED); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - assertThat(dataFiles).hasSize(2); - RewriteDataFilesActionResult result = - Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); - - assertThat(result.deletedDataFiles()).hasSize(2); - assertThat(result.addedDataFiles()).hasSize(1); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - assertThat(dataFiles1).hasSize(1); - // Assert the table records as expected. - SimpleDataUtil.assertTableRecords( - icebergTableUnPartitioned, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), SimpleDataUtil.createRecord(2, "world"))); - } - - @TestTemplate - public void testRewriteDataFilesPartitionedTable() throws Exception { - // Flink does not support compaction on row lineage enabled tables (V3+) - assumeThat(formatVersion).isLessThan(3); - - sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 3, 'world' ,'b'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARTITIONED); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - assertThat(dataFiles).hasSize(4); - RewriteDataFilesActionResult result = - Actions.forTable(icebergTablePartitioned).rewriteDataFiles().execute(); - - assertThat(result.deletedDataFiles()).hasSize(4); - assertThat(result.addedDataFiles()).hasSize(2); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - assertThat(dataFiles1).hasSize(2); - // Assert the table records as expected. 
- Schema schema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get())); - - Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords( - icebergTablePartitioned, - Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "b"), - record.copy("id", 4, "data", "world", "spec", "b"))); - } - - @TestTemplate - public void testRewriteDataFilesWithFilter() throws Exception { - // Flink does not support compaction on row lineage enabled tables (V3+) - assumeThat(formatVersion).isLessThan(3); - - sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 3, 'world' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 5, 'world' ,'b'", TABLE_NAME_PARTITIONED); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - assertThat(dataFiles).hasSize(5); - RewriteDataFilesActionResult result = - Actions.forTable(icebergTablePartitioned) - .rewriteDataFiles() - .filter(Expressions.equal("spec", "a")) - .filter(Expressions.startsWith("data", "he")) - .execute(); - assertThat(result.deletedDataFiles()).hasSize(2); - assertThat(result.addedDataFiles()).hasSize(1); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - assertThat(dataFiles1).hasSize(4); - // Assert the table records as expected. 
- Schema schema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get())); - - Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords( - icebergTablePartitioned, - Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "a"), - record.copy("id", 4, "data", "world", "spec", "b"), - record.copy("id", 5, "data", "world", "spec", "b"))); - } - - @TestTemplate - public void testRewriteLargeTableHasResiduals() throws IOException { - // Flink does not support compaction on row lineage enabled tables (V3+) - assumeThat(formatVersion).isLessThan(3); - - // all records belong to the same partition - List records1 = Lists.newArrayList(); - List records2 = Lists.newArrayList(); - List expected = Lists.newArrayList(); - for (int i = 0; i < 100; i++) { - int id = i; - String data = String.valueOf(i % 3); - if (i % 2 == 0) { - records1.add("(" + id + ",'" + data + "')"); - } else { - records2.add("(" + id + ",'" + data + "')"); - } - Record record = RECORD.copy(); - record.setField("id", id); - record.setField("data", data); - expected.add(record); - } - - sql("INSERT INTO %s values " + StringUtils.join(records1, ","), TABLE_NAME_UNPARTITIONED); - sql("INSERT INTO %s values " + StringUtils.join(records2, ","), TABLE_NAME_UNPARTITIONED); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks = - icebergTableUnPartitioned - .newScan() - .ignoreResiduals() - .filter(Expressions.equal("data", "0")) - .planFiles(); - for (FileScanTask task : tasks) { - assertThat(task.residual()) - .as("Residuals must be ignored") - .isEqualTo(Expressions.alwaysTrue()); - } - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - assertThat(dataFiles).hasSize(2); - Actions actions = Actions.forTable(icebergTableUnPartitioned); - - RewriteDataFilesActionResult result = - actions.rewriteDataFiles().filter(Expressions.equal("data", "0")).execute(); - assertThat(result.deletedDataFiles()).hasSize(2); - assertThat(result.addedDataFiles()).hasSize(1); - // Assert the table records as expected. - SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected); - } - - /** - * a test case to test avoid repeate compress - * - *
If a data file cannot be combined into a CombinedScanTask with other data files, the - * resulting CombinedScanTask list has size 1, so we remove these CombinedScanTasks to avoid - * compacting the same file repeatedly. - * - *
In this test case, we generate 3 data files and set targetSizeInBytes greater than the - * largest file size so that it cannot be combined into a CombinedScanTask with other data files. The - * data file with the largest file size will not be compacted. - * - * @throws IOException IOException - */ - @TestTemplate - public void testRewriteAvoidRepeateCompress() throws IOException { - // Flink does not support compaction on row lineage enabled tables (V3+) - assumeThat(formatVersion).isLessThan(3); - - List expected = Lists.newArrayList(); - Schema schema = icebergTableUnPartitioned.schema(); - GenericAppenderFactory genericAppenderFactory = new GenericAppenderFactory(schema); - File file = File.createTempFile("junit", null, temp.toFile()); - int count = 0; - try (FileAppender fileAppender = - genericAppenderFactory.newAppender(Files.localOutput(file), format)) { - long filesize = 20000; - for (; fileAppender.length() < filesize; count++) { - Record record = SimpleDataUtil.createRecord(count, UUID.randomUUID().toString()); - fileAppender.add(record); - expected.add(record); - } - } - - DataFile dataFile = - DataFiles.builder(icebergTableUnPartitioned.spec()) - .withPath(file.getAbsolutePath()) - .withFileSizeInBytes(file.length()) - .withFormat(format) - .withRecordCount(count) - .build(); - - icebergTableUnPartitioned.newAppend().appendFile(dataFile).commit(); - - sql("INSERT INTO %s SELECT 1,'a' ", TABLE_NAME_UNPARTITIONED); - sql("INSERT INTO %s SELECT 2,'b' ", TABLE_NAME_UNPARTITIONED); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - assertThat(dataFiles).hasSize(3); - Actions actions = Actions.forTable(icebergTableUnPartitioned); - - long targetSizeInBytes = file.length() + 10; - RewriteDataFilesActionResult result = - actions - .rewriteDataFiles() - .targetSizeInBytes(targetSizeInBytes) - .splitOpenFileCost(1) - .execute(); - assertThat(result.deletedDataFiles()).hasSize(2); - assertThat(result.addedDataFiles()).hasSize(1); - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); - List dataFilesRewrote = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - assertThat(dataFilesRewrote).hasSize(2); - // the biggest file should not have been rewritten - List rewroteDataFileNames = - dataFilesRewrote.stream().map(ContentFile::location).collect(Collectors.toList()); - assertThat(rewroteDataFileNames).contains(file.getAbsolutePath()); - - // Assert the table records as expected.
- expected.add(SimpleDataUtil.createRecord(1, "a")); - expected.add(SimpleDataUtil.createRecord(2, "b")); - SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected); - } - - @TestTemplate - public void testRewriteNoConflictWithEqualityDeletes() throws IOException { - // Flink does not support compaction on row lineage enabled tables (V3+) - assumeThat(formatVersion).isLessThan(3); - - // Add 2 data files - sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_WITH_PK); - sql("INSERT INTO %s SELECT 2, 'world'", TABLE_NAME_WITH_PK); - - // Load 2 stale tables to pass to rewrite actions - // Since the first rewrite will refresh stale1, we need another stale2 for the second rewrite - Table stale1 = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); - Table stale2 = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); - - // Add 1 data file and 1 equality-delete file - sql("INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ SELECT 1, 'hi'", TABLE_NAME_WITH_PK); - - icebergTableWithPk.refresh(); - assertThat(icebergTableWithPk.currentSnapshot().sequenceNumber()) - .as("The latest sequence number should be greater than that of the stale snapshot") - .isEqualTo(stale1.currentSnapshot().sequenceNumber() + 1); - CloseableIterable tasks = icebergTableWithPk.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Set deleteFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::deletes)).stream() - .flatMap(Collection::stream) - .collect(Collectors.toSet()); - assertThat(dataFiles).hasSize(3); - assertThat(deleteFiles).hasSize(1); - assertThat(Iterables.getOnlyElement(deleteFiles).content()) - .isEqualTo(FileContent.EQUALITY_DELETES); - shouldHaveDataAndFileSequenceNumbers( - TABLE_NAME_WITH_PK, - ImmutableList.of(Pair.of(1L, 1L), Pair.of(2L, 2L), Pair.of(3L, 3L), Pair.of(3L, 3L))); - - assertThatThrownBy( - () -> - Actions.forTable(stale1) - .rewriteDataFiles() - .useStartingSequenceNumber(false) - .execute(), - "Rewrite using new sequence number should fail") - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Cannot commit, found new delete for replaced data file"); - - // Rewrite using the starting sequence number should succeed - RewriteDataFilesActionResult result = - Actions.forTable(stale2).rewriteDataFiles().useStartingSequenceNumber(true).execute(); - - // Should not rewrite files from the new commit - assertThat(result.deletedDataFiles()).hasSize(2); - assertThat(result.addedDataFiles()).hasSize(1); - // The 2 older files with file-sequence-number <= 2 should be rewritten into a new file. - // The new file is the one with file-sequence-number == 4. - // The new file should use rewrite's starting-sequence-number 2 as its data-sequence-number. - shouldHaveDataAndFileSequenceNumbers( - TABLE_NAME_WITH_PK, ImmutableList.of(Pair.of(3L, 3L), Pair.of(3L, 3L), Pair.of(2L, 4L))); - - // Assert the table records as expected. - SimpleDataUtil.assertTableRecords( - icebergTableWithPk, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hi"), SimpleDataUtil.createRecord(2, "world"))); - } - - /** - * Assert that data files and delete files in the table should have expected data sequence numbers - * and file sequence numbers - * - * @param tableName table name - * @param expectedSequenceNumbers list of {@link Pair}'s. 
Each {@link Pair} contains - * (expectedDataSequenceNumber, expectedFileSequenceNumber) of a file. - */ - private void shouldHaveDataAndFileSequenceNumbers( - String tableName, List> expectedSequenceNumbers) { - // "status < 2" for added or existing entries - List liveEntries = sql("SELECT * FROM %s$entries WHERE status < 2", tableName); - - List> actualSequenceNumbers = - liveEntries.stream() - .map( - row -> - Pair.of( - row.getFieldAs("sequence_number"), row.getFieldAs("file_sequence_number"))) - .collect(Collectors.toList()); - assertThat(actualSequenceNumbers).hasSameElementsAs(expectedSequenceNumbers); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java deleted file mode 100644 index cc58d9817ac6..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.RowDataConverter; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; - -public class RandomRowData { - private RandomRowData() {} - - public static Iterable generate(Schema schema, int numRecords, long seed) { - return convert(schema, RandomGenericData.generate(schema, numRecords, seed)); - } - - public static Iterable convert(Schema schema, Iterable records) { - return Iterables.transform(records, record -> RowDataConverter.convert(schema, record)); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java deleted file mode 100644 index 74b1da6007e6..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import org.apache.flink.api.common.functions.RichMapFunction; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.conversion.DataStructureConverter; -import org.apache.flink.table.data.conversion.DataStructureConverters; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.flink.types.Row; - -public class RowDataToRowMapper extends RichMapFunction { - - private final RowType rowType; - - private transient DataStructureConverter converter; - - public RowDataToRowMapper(RowType rowType) { - this.rowType = rowType; - } - - @Override - public void open(Configuration parameters) throws Exception { - this.converter = - DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); - } - - @Override - public Row map(RowData value) throws Exception { - return (Row) converter.toExternal(value); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java deleted file mode 100644 index 45b679eeda73..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.DataTestBase; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.avro.DataWriter; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.inmemory.InMemoryOutputFile; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -public class TestFlinkAvroReaderWriter extends DataTestBase { - - private static final int NUM_RECORDS = 100; - - @Override - protected boolean supportsDefaultValues() { - return true; - } - - @Override - protected boolean supportsUnknown() { - return true; - } - - @Override - protected boolean supportsTimestampNanos() { - return true; - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1991L); - writeAndValidate(schema, expectedRecords); - } - - @Override - protected void writeAndValidate(Schema schema, List expectedRecords) throws IOException { - writeAndValidate(schema, schema, expectedRecords); - } - - @Override - protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throws IOException { - List expectedRecords = RandomGenericData.generate(writeSchema, NUM_RECORDS, 1991L); - writeAndValidate(writeSchema, expectedSchema, expectedRecords); - } - - protected void writeAndValidate( - Schema writeSchema, Schema expectedSchema, List expectedRecords) throws IOException { - List expectedRows = - Lists.newArrayList(RandomRowData.convert(writeSchema, expectedRecords)); - - OutputFile outputFile = new InMemoryOutputFile(); - - // Write the expected records into AVRO file, then read them into RowData and assert with the - // expected Record list. - try (FileAppender writer = - Avro.write(outputFile).schema(writeSchema).createWriterFunc(DataWriter::create).build()) { - writer.addAll(expectedRecords); - } - - RowType flinkSchema = FlinkSchemaUtil.convert(expectedSchema); - - try (CloseableIterable reader = - Avro.read(outputFile.toInputFile()) - .project(expectedSchema) - .createResolvingReader(FlinkPlannedAvroReader::create) - .build()) { - Iterator expected = expectedRecords.iterator(); - Iterator rows = reader.iterator(); - for (int i = 0; i < expectedRecords.size(); i++) { - assertThat(rows).hasNext(); - TestHelpers.assertRowData( - expectedSchema.asStruct(), flinkSchema, expected.next(), rows.next()); - } - assertThat(rows).isExhausted(); - } - - OutputFile file = new InMemoryOutputFile(); - - // Write the expected RowData into AVRO file, then read them into Record and assert with the - // expected RowData list. 
- try (FileAppender writer = - Avro.write(file) - .schema(writeSchema) - .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) - .build()) { - writer.addAll(expectedRows); - } - - try (CloseableIterable reader = - Avro.read(file.toInputFile()) - .project(expectedSchema) - .createResolvingReader(FlinkPlannedAvroReader::create) - .build()) { - Iterator expected = expectedRecords.iterator(); - Iterator rows = reader.iterator(); - for (int i = 0; i < expectedRecords.size(); i += 1) { - assertThat(rows).hasNext(); - TestHelpers.assertRowData( - expectedSchema.asStruct(), flinkSchema, expected.next(), rows.next()); - } - assertThat(rows).isExhausted(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java deleted file mode 100644 index 4a70802f2a2e..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.DataTestBase; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.orc.GenericOrcReader; -import org.apache.iceberg.data.orc.GenericOrcWriter; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -public class TestFlinkOrcReaderWriter extends DataTestBase { - private static final int NUM_RECORDS = 100; - - /** Orc writers don't have notion of non-null / required fields. 
*/ - @Override - protected boolean allowsWritingNullValuesForRequiredFields() { - return true; - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L); - writeAndValidate(schema, expectedRecords); - } - - @Override - protected void writeAndValidate(Schema schema, List expectedRecords) throws IOException { - RowType flinkSchema = FlinkSchemaUtil.convert(schema); - List expectedRows = Lists.newArrayList(RandomRowData.convert(schema, expectedRecords)); - - File recordsFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(recordsFile.delete()).isTrue(); - - // Write the expected records into ORC file, then read them into RowData and assert with the - // expected Record list. - try (FileAppender writer = - ORC.write(Files.localOutput(recordsFile)) - .schema(schema) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { - writer.addAll(expectedRecords); - } - - try (CloseableIterable reader = - ORC.read(Files.localInput(recordsFile)) - .project(schema) - .createReaderFunc(type -> new FlinkOrcReader(schema, type)) - .build()) { - Iterator expected = expectedRecords.iterator(); - Iterator rows = reader.iterator(); - for (int i = 0; i < expectedRecords.size(); i++) { - assertThat(rows).hasNext(); - TestHelpers.assertRowData(schema.asStruct(), flinkSchema, expected.next(), rows.next()); - } - assertThat(rows).isExhausted(); - } - - File rowDataFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(rowDataFile.delete()).isTrue(); - - // Write the expected RowData into ORC file, then read them into Record and assert with the - // expected RowData list. - RowType rowType = FlinkSchemaUtil.convert(schema); - try (FileAppender writer = - ORC.write(Files.localOutput(rowDataFile)) - .schema(schema) - .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema)) - .build()) { - writer.addAll(expectedRows); - } - - try (CloseableIterable reader = - ORC.read(Files.localInput(rowDataFile)) - .project(schema) - .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type)) - .build()) { - Iterator expected = expectedRows.iterator(); - Iterator records = reader.iterator(); - for (int i = 0; i < expectedRecords.size(); i += 1) { - assertThat(records.hasNext()).isTrue(); - TestHelpers.assertRowData(schema.asStruct(), flinkSchema, records.next(), expected.next()); - } - assertThat(records).isExhausted(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java deleted file mode 100644 index e6781356f711..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.parquet.schema.Types.primitive; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.Iterator; -import java.util.List; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.GenericRecordBuilder; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.data.DataTestBase; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.parquet.GenericParquetWriter; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.parquet.ParquetValueReader; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.parquet.avro.AvroParquetWriter; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.schema.LogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; -import org.junit.jupiter.api.Test; - -public class TestFlinkParquetReader extends DataTestBase { - private static final int NUM_RECORDS = 100; - - @Override - protected boolean supportsDefaultValues() { - return true; - } - - @Override - protected boolean supportsUnknown() { - return true; - } - - @Override - protected boolean supportsTimestampNanos() { - return true; - } - - @Test - public void testBuildReader() { - MessageType fileSchema = - new MessageType( - "test", - // 0: required(100, "id", LongType.get()) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) - .id(100) - .named("id"), - // 1: optional(101, "data", Types.StringType.get()) - primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.OPTIONAL) - .id(101) - .named("data"), - // 2: required(102, "b", Types.BooleanType.get()) - primitive(PrimitiveType.PrimitiveTypeName.BOOLEAN, Type.Repetition.REQUIRED) - .id(102) - .named("b"), - // 3: optional(103, "i", Types.IntegerType.get()) - primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL) - .id(103) - .named("i"), - // 4: optional(105, "f", Types.FloatType.get()) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) - .id(104) - .named("l"), - // 5: required(106, "d", Types.DoubleType.get()) - primitive(PrimitiveType.PrimitiveTypeName.FLOAT, Type.Repetition.OPTIONAL) - .id(105) - .named("f"), - // 6: required(106, "d", Types.DoubleType.get()) - 
primitive(PrimitiveType.PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) - .id(106) - .named("d"), - // 7: optional(107, "date", Types.DateType.get()) - primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL) - .id(107) - .as(LogicalTypeAnnotation.dateType()) - .named("date"), - // 8: required(108, "ts_tz", Types.TimestampType.withZone()) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) - .id(108) - .as( - LogicalTypeAnnotation.timestampType( - true, LogicalTypeAnnotation.TimeUnit.MICROS)) - .named("ts_tz"), - // 9: required(109, "ts", Types.TimestampType.withoutZone()) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) - .id(109) - .as( - LogicalTypeAnnotation.timestampType( - false, LogicalTypeAnnotation.TimeUnit.MICROS)) - .named("ts"), - // 10: required(110, "s", Types.StringType.get()) - primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .id(110) - .as(LogicalTypeAnnotation.stringType()) - .named("s"), - // 11: required(112, "fixed", Types.FixedType.ofLength(7)) - primitive( - PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, Type.Repetition.REQUIRED) - .id(112) - .length(7) - .named("f"), - // 12: optional(113, "bytes", Types.BinaryType.get()) - primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.OPTIONAL) - .id(113) - .named("bytes"), - // 13: required(114, "dec_9_0", Types.DecimalType.of(9, 0)) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) - .id(114) - .as(LogicalTypeAnnotation.decimalType(0, 9)) - .named("dec_9_0"), - // 14: required(115, "dec_11_2", Types.DecimalType.of(11, 2)) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) - .id(115) - .as(LogicalTypeAnnotation.decimalType(2, 11)) - .named("dec_11_2"), - // 15: required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision - primitive( - PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, Type.Repetition.REQUIRED) - .id(116) - .length(16) - .as(LogicalTypeAnnotation.decimalType(10, 38)) - .named("dec_38_10"), - // 16: required(117, "time", Types.TimeType.get()) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.OPTIONAL) - .id(117) - .as(LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) - .named("time")); - ParquetValueReader reader = - FlinkParquetReaders.buildReader(new Schema(SUPPORTED_PRIMITIVES.fields()), fileSchema); - - assertThat(reader.columns()).hasSameSizeAs(SUPPORTED_PRIMITIVES.fields()); - } - - @Test - public void testTwoLevelList() throws IOException { - Schema schema = - new Schema( - optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), - optional(2, "topbytes", Types.BinaryType.get())); - org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); - - File testFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(testFile.delete()).isTrue(); - - ParquetWriter writer = - AvroParquetWriter.builder(new Path(testFile.toURI())) - .withDataModel(GenericData.get()) - .withSchema(avroSchema) - .config("parquet.avro.add-list-element-records", "true") - .config("parquet.avro.write-old-list-structure", "true") - .build(); - - GenericRecordBuilder recordBuilder = new GenericRecordBuilder(avroSchema); - List expectedByteList = Lists.newArrayList(); - byte[] expectedByte = {0x00, 0x01}; - ByteBuffer expectedBinary = ByteBuffer.wrap(expectedByte); - expectedByteList.add(expectedBinary); - recordBuilder.set("arraybytes", 
expectedByteList); - recordBuilder.set("topbytes", expectedBinary); - GenericData.Record expectedRecord = recordBuilder.build(); - - writer.write(expectedRecord); - writer.close(); - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) - .build()) { - Iterator rows = reader.iterator(); - assertThat(rows).hasNext(); - RowData rowData = rows.next(); - assertThat(rowData.getArray(0).getBinary(0)).isEqualTo(expectedByte); - assertThat(rowData.getBinary(1)).isEqualTo(expectedByte); - assertThat(rows).isExhausted(); - } - } - - private void writeAndValidate( - Iterable iterable, Schema writeSchema, Schema expectedSchema) throws IOException { - File testFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(testFile.delete()).isTrue(); - - try (FileAppender writer = - Parquet.write(Files.localOutput(testFile)) - .schema(writeSchema) - .createWriterFunc(GenericParquetWriter::create) - .build()) { - writer.addAll(iterable); - } - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(expectedSchema) - .createReaderFunc(type -> FlinkParquetReaders.buildReader(expectedSchema, type)) - .build()) { - Iterator expected = iterable.iterator(); - Iterator rows = reader.iterator(); - LogicalType rowType = FlinkSchemaUtil.convert(writeSchema); - for (int i = 0; i < NUM_RECORDS; i += 1) { - assertThat(rows).hasNext(); - TestHelpers.assertRowData(writeSchema.asStruct(), rowType, expected.next(), rows.next()); - } - assertThat(rows).isExhausted(); - } - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - writeAndValidate(RandomGenericData.generate(schema, NUM_RECORDS, 19981), schema, schema); - writeAndValidate( - RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124), - schema, - schema); - writeAndValidate( - RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20), - schema, - schema); - } - - @Override - protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throws IOException { - writeAndValidate(RandomGenericData.generate(writeSchema, 100, 0L), writeSchema, expectedSchema); - } - - @Override - protected void writeAndValidate(Schema schema, List expectedData) throws IOException { - writeAndValidate(expectedData, schema, schema); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java deleted file mode 100644 index d181d3351410..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Iterator; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.binary.BinaryRowData; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.DataTestBase; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.parquet.GenericParquetReaders; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataConverter; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.inmemory.InMemoryOutputFile; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.io.TempDir; - -public class TestFlinkParquetWriter extends DataTestBase { - private static final int NUM_RECORDS = 100; - - @TempDir private Path temp; - - @Override - protected boolean supportsUnknown() { - return true; - } - - @Override - protected boolean supportsTimestampNanos() { - return true; - } - - private void writeAndValidate(Iterable iterable, Schema schema) throws IOException { - OutputFile outputFile = new InMemoryOutputFile(); - - LogicalType logicalType = FlinkSchemaUtil.convert(schema); - - try (FileAppender writer = - Parquet.write(outputFile) - .schema(schema) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(logicalType, msgType)) - .build()) { - writer.addAll(iterable); - } - - try (CloseableIterable reader = - Parquet.read(outputFile.toInputFile()) - .project(schema) - .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType)) - .build()) { - Iterator expected = iterable.iterator(); - Iterator actual = reader.iterator(); - LogicalType rowType = FlinkSchemaUtil.convert(schema); - for (int i = 0; i < NUM_RECORDS; i += 1) { - assertThat(actual).hasNext(); - TestHelpers.assertRowData(schema.asStruct(), rowType, actual.next(), expected.next()); - } - assertThat(actual).isExhausted(); - } - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - writeAndValidate(RandomRowData.generate(schema, NUM_RECORDS, 19981), schema); - - writeAndValidate( - RandomRowData.convert( - schema, - RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124)), - schema); - - writeAndValidate( - RandomRowData.convert( - schema, - RandomGenericData.generateFallbackRecords( - schema, NUM_RECORDS, 21124, NUM_RECORDS / 20)), - schema); - } - - @Override - protected void writeAndValidate(Schema schema, List expectedData) throws IOException { - RowDataSerializer rowDataSerializer = new RowDataSerializer(FlinkSchemaUtil.convert(schema)); - List binaryRowList = Lists.newArrayList(); - for (Record record : expectedData) { - RowData rowData = RowDataConverter.convert(schema, record); - BinaryRowData binaryRow = rowDataSerializer.toBinaryRow(rowData); - binaryRowList.add(binaryRow); - } - - writeAndValidate(binaryRowList, schema); - } -} diff --git 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java deleted file mode 100644 index 4e5b38ffb026..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ /dev/null @@ -1,593 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatNoException; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.List; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.DataGenerator; -import org.apache.iceberg.flink.DataGenerators; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.StructProjection; -import org.junit.jupiter.api.Test; - -public class TestRowDataProjection { - @Test - public void testNullRootRowData() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowDataProjection projection = RowDataProjection.create(schema, schema.select("id")); - - assertThatThrownBy(() -> projection.wrap(null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid row data: null"); - } - - @Test - public void testFullProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - generateAndValidate(schema, schema); - - GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - } - - @Test - public void testReorderedFullProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - 
Schema reordered = - new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get())); - - generateAndValidate(schema, reordered); - - GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); - testEqualsAndHashCode(schema, reordered, rowData, copyRowData, otherRowData); - } - - @Test - public void testBasicProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); - generateAndValidate(schema, idOnly); - generateAndValidate(schema, dataOnly); - - GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); - testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, dataOnly, rowData, copyRowData, otherRowData); - } - - @Test - public void testEmptyProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - generateAndValidate(schema, schema.select()); - - GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); - testEqualsAndHashCode(schema, schema.select(), rowData, copyRowData, otherRowData, true); - } - - @Test - public void testRename() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - Schema renamed = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get())); - generateAndValidate(schema, renamed); - - GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); - testEqualsAndHashCode(schema, renamed, rowData, copyRowData, otherRowData); - } - - @Test - public void testNestedProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 3, - "location", - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get())))); - - GenericRowData rowData = GenericRowData.of(1L, GenericRowData.of(1.0f, 1.0f)); - GenericRowData copyRowData = GenericRowData.of(1L, GenericRowData.of(1.0f, 1.0f)); - GenericRowData otherRowData = GenericRowData.of(2L, GenericRowData.of(2.0f, 2.0f)); - - GenericRowData rowDataNullStruct = GenericRowData.of(1L, null); - GenericRowData copyRowDataNullStruct = GenericRowData.of(1L, null); - GenericRowData otherRowDataNullStruct = GenericRowData.of(2L, null); - - // 
Project id only. - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - assertThat(idOnly.columns()).isNotEmpty(); - generateAndValidate(schema, idOnly); - testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode( - schema, idOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct); - - // Project lat only. - Schema latOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); - assertThat(latOnly.columns()).isNotEmpty(); - generateAndValidate(schema, latOnly); - testEqualsAndHashCode(schema, latOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode( - schema, latOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct, true); - - // Project long only. - Schema longOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); - assertThat(longOnly.columns()).isNotEmpty(); - generateAndValidate(schema, longOnly); - testEqualsAndHashCode(schema, longOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode( - schema, longOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct, true); - - // Project location. - Schema locationOnly = schema.select("location"); - assertThat(locationOnly.columns()).isNotEmpty(); - generateAndValidate(schema, locationOnly); - testEqualsAndHashCode(schema, locationOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode( - schema, - locationOnly, - rowDataNullStruct, - copyRowDataNullStruct, - otherRowDataNullStruct, - true); - } - - @Test - public void testPrimitivesFullProjection() { - DataGenerator dataGenerator = new DataGenerators.Primitives(); - Schema schema = dataGenerator.icebergSchema(); - generateAndValidate(schema, schema); - - GenericRowData rowData = dataGenerator.generateFlinkRowData(); - GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); - GenericRowData otherRowData = dataGenerator.generateFlinkRowData(); - // modify the string field value (position 6) - otherRowData.setField(6, StringData.fromString("foo_bar")); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - - GenericRowData rowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); - setOptionalFieldsNullForPrimitives(rowDataNullOptionalFields); - GenericRowData copyRowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); - setOptionalFieldsNullForPrimitives(copyRowDataNullOptionalFields); - GenericRowData otherRowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); - // modify the string field value (position 6) - otherRowDataNullOptionalFields.setField(6, StringData.fromString("foo_bar")); - setOptionalFieldsNullForPrimitives(otherRowData); - testEqualsAndHashCode( - schema, - schema, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - } - - private void setOptionalFieldsNullForPrimitives(GenericRowData rowData) { - // fields from [1, 5] range are optional - for (int pos = 1; pos <= 5; ++pos) { - rowData.setField(pos, null); - } - } - - @Test - public void testMapOfPrimitivesProjection() { - DataGenerator dataGenerator = new DataGenerators.MapOfPrimitives(); - Schema schema = dataGenerator.icebergSchema(); - - // Project id only. 
- Schema idOnly = schema.select("row_id"); - assertThat(idOnly.columns()).isNotEmpty(); - generateAndValidate(schema, idOnly); - - // Project map only. - Schema mapOnly = schema.select("map_of_primitives"); - assertThat(mapOnly.columns()).isNotEmpty(); - generateAndValidate(schema, mapOnly); - - // Project all. - generateAndValidate(schema, schema); - - GenericRowData rowData = dataGenerator.generateFlinkRowData(); - GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); - // modify the map field value - GenericRowData otherRowData = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2))); - testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData, true); - testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - - GenericRowData rowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); - GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); - // modify the map field value - GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("other_row_id_value"), null); - testEqualsAndHashCode( - schema, - idOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - testEqualsAndHashCode( - schema, - mapOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields, - true); - testEqualsAndHashCode( - schema, - schema, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - } - - @Test - public void testMapOfStructStructProjection() { - DataGenerator dataGenerator = new DataGenerators.MapOfStructStruct(); - Schema schema = dataGenerator.icebergSchema(); - - // Project id only. - Schema idOnly = schema.select("row_id"); - assertThat(idOnly.columns()).isNotEmpty(); - generateAndValidate(schema, idOnly); - - // Project map only. - Schema mapOnly = schema.select("map"); - assertThat(mapOnly.columns()).isNotEmpty(); - generateAndValidate(schema, mapOnly); - - // Project all. - generateAndValidate(schema, schema); - - // Project partial map key. - Schema partialMapKey = - new Schema( - Types.NestedField.optional( - 2, - "map", - Types.MapType.ofOptional( - 101, - 102, - Types.StructType.of( - Types.NestedField.required(201, "key", Types.LongType.get())), - Types.StructType.of( - Types.NestedField.required(203, "value", Types.LongType.get()), - Types.NestedField.required(204, "valueData", Types.StringType.get()))))); - assertThatThrownBy(() -> generateAndValidate(schema, partialMapKey)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot project a partial map key or value struct."); - - // Project partial map key. 
- Schema partialMapValue = - new Schema( - Types.NestedField.optional( - 2, - "map", - Types.MapType.ofOptional( - 101, - 102, - Types.StructType.of( - Types.NestedField.required(201, "key", Types.LongType.get()), - Types.NestedField.required(202, "keyData", Types.StringType.get())), - Types.StructType.of( - Types.NestedField.required(203, "value", Types.LongType.get()))))); - assertThatThrownBy(() -> generateAndValidate(schema, partialMapValue)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot project a partial map key or value struct."); - - GenericRowData rowData = dataGenerator.generateFlinkRowData(); - GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); - // modify the map field value - GenericRowData otherRowData = - GenericRowData.of( - StringData.fromString("other_row_id_value"), - new GenericMapData( - ImmutableMap.of( - GenericRowData.of(1L, StringData.fromString("other_key_data")), - GenericRowData.of(1L, StringData.fromString("other_value_data"))))); - testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - - GenericRowData rowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of(GenericRowData.of(1L, null), GenericRowData.of(1L, null)))); - GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of(GenericRowData.of(1L, null), GenericRowData.of(1L, null)))); - // modify the map field value - GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("other_row_id_value"), - new GenericMapData( - ImmutableMap.of(GenericRowData.of(2L, null), GenericRowData.of(2L, null)))); - testEqualsAndHashCode( - schema, - idOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - testEqualsAndHashCode( - schema, - mapOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - testEqualsAndHashCode( - schema, - schema, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - } - - @Test - public void testArrayOfPrimitiveProjection() { - DataGenerator dataGenerator = new DataGenerators.ArrayOfPrimitive(); - Schema schema = dataGenerator.icebergSchema(); - - // Project id only. - Schema idOnly = schema.select("row_id"); - assertThat(idOnly.columns()).isNotEmpty(); - generateAndValidate(schema, idOnly); - - // Project list only. - Schema arrayOnly = schema.select("array_of_int"); - assertThat(arrayOnly.columns()).isNotEmpty(); - generateAndValidate(schema, arrayOnly); - - // Project all. 
- generateAndValidate(schema, schema); - - GenericRowData rowData = dataGenerator.generateFlinkRowData(); - GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); - // modify the map field value - GenericRowData otherRowData = - GenericRowData.of( - StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, 5, 6})); - testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, arrayOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - - GenericRowData rowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); - GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); - // modify the map field value - GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, null, 6})); - testEqualsAndHashCode( - schema, - idOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - testEqualsAndHashCode( - schema, - arrayOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - testEqualsAndHashCode( - schema, - schema, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - } - - @Test - public void testArrayOfStructProjection() { - DataGenerator dataGenerator = new DataGenerators.ArrayOfStruct(); - Schema schema = dataGenerator.icebergSchema(); - - // Project id only. - Schema idOnly = schema.select("row_id"); - assertThat(idOnly.columns()).isNotEmpty(); - generateAndValidate(schema, idOnly); - - // Project list only. - Schema arrayOnly = schema.select("array_of_struct"); - assertThat(arrayOnly.columns()).isNotEmpty(); - generateAndValidate(schema, arrayOnly); - - // Project all. - generateAndValidate(schema, schema); - - // Project partial list value. 
- Schema partialList = - new Schema( - Types.NestedField.optional( - 2, - "array_of_struct", - Types.ListType.ofOptional( - 101, - Types.StructType.of( - Types.NestedField.required(202, "name", Types.StringType.get()))))); - - assertThatThrownBy(() -> generateAndValidate(schema, partialList)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot project a partial list element struct."); - - GenericRowData rowData = dataGenerator.generateFlinkRowData(); - GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); - // modify the map field value - GenericRowData otherRowData = - GenericRowData.of( - StringData.fromString("row_id_value"), new GenericArrayData(new Integer[] {4, 5, 6})); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - - GenericRowData rowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); - GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); - // modify the map field value - GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {4, null, 6})); - testEqualsAndHashCode( - schema, - schema, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - } - - private void generateAndValidate(Schema schema, Schema projectSchema) { - int numRecords = 100; - List recordList = RandomGenericData.generate(schema, numRecords, 102L); - List rowDataList = - Lists.newArrayList(RandomRowData.generate(schema, numRecords, 102L).iterator()); - assertThat(rowDataList).hasSize(recordList.size()); - - StructProjection structProjection = StructProjection.create(schema, projectSchema); - RowDataProjection rowDataProjection = RowDataProjection.create(schema, projectSchema); - - for (int i = 0; i < numRecords; i++) { - StructLike expected = structProjection.wrap(recordList.get(i)); - RowData projected = rowDataProjection.wrap(rowDataList.get(i)); - TestHelpers.assertRowData(projectSchema, expected, projected); - - assertThat(projected).isEqualTo(projected); - assertThat(projected).hasSameHashCodeAs(projected); - // make sure toString doesn't throw NPE for null values - assertThatNoException().isThrownBy(projected::toString); - } - } - - private void testEqualsAndHashCode( - Schema schema, - Schema projectionSchema, - RowData rowData, - RowData copyRowData, - RowData otherRowData) { - testEqualsAndHashCode(schema, projectionSchema, rowData, copyRowData, otherRowData, false); - } - - /** - * @param isOtherRowDataSameAsRowData sometimes projection on otherRowData can result in the same - * RowData, e.g. 
due to empty projection or null struct - */ - private void testEqualsAndHashCode( - Schema schema, - Schema projectionSchema, - RowData rowData, - RowData copyRowData, - RowData otherRowData, - boolean isOtherRowDataSameAsRowData) { - RowDataProjection projection = RowDataProjection.create(schema, projectionSchema); - RowDataProjection copyProjection = RowDataProjection.create(schema, projectionSchema); - RowDataProjection otherProjection = RowDataProjection.create(schema, projectionSchema); - - assertThat(projection.wrap(rowData)).isEqualTo(copyProjection.wrap(copyRowData)); - assertThat(projection.wrap(rowData)).hasSameHashCodeAs(copyProjection.wrap(copyRowData)); - - if (isOtherRowDataSameAsRowData) { - assertThat(projection.wrap(rowData)).isEqualTo(otherProjection.wrap(otherRowData)); - assertThat(projection.wrap(rowData)).hasSameHashCodeAs(otherProjection.wrap(otherRowData)); - } else { - assertThat(projection.wrap(rowData)).isNotEqualTo(otherProjection.wrap(otherRowData)); - assertThat(projection.wrap(rowData)) - .doesNotHaveSameHashCodeAs(otherProjection.wrap(otherRowData)); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java deleted file mode 100644 index 004cc8234876..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java +++ /dev/null @@ -1,583 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.withPrecision; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.types.Comparators; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestRowProjection { - - @TempDir private Path temp; - - private RowData writeAndRead(String desc, Schema writeSchema, Schema readSchema, RowData row) - throws IOException { - File file = File.createTempFile("junit", desc + ".avro", temp.toFile()); - assertThat(file.delete()).isTrue(); - - try (FileAppender appender = - Avro.write(Files.localOutput(file)) - .schema(writeSchema) - .createWriterFunc(ignore -> new FlinkAvroWriter(FlinkSchemaUtil.convert(writeSchema))) - .build()) { - appender.add(row); - } - - Avro.ReadBuilder builder = - Avro.read(Files.localInput(file)) - .project(readSchema) - .createResolvingReader(FlinkPlannedAvroReader::create); - - Iterable records = builder.build(); - - return Iterables.getOnlyElement(records); - } - - @Test - public void testFullProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - RowData projected = writeAndRead("full_projection", schema, schema, row); - - assertThat(projected.getLong(0)).isEqualTo(34); - assertThat(projected.getString(1)).asString().isEqualTo("test"); - } - - @Test - public void testSpecialCharacterProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "user id", Types.LongType.get()), - Types.NestedField.optional(1, "data%0", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - RowData full = writeAndRead("special_chars", schema, schema, row); - - assertThat(full.getLong(0)).isEqualTo(34L); - assertThat(full.getString(1)).asString().isEqualTo("test"); - - RowData projected = writeAndRead("special_characters", schema, schema.select("data%0"), full); - - assertThat(projected.getArity()).isEqualTo(1); - assertThat(projected.getString(0)).asString().isEqualTo("test"); - } - - @Test - public void testReorderedFullProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema reordered = - new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = 
writeAndRead("full_projection", schema, reordered, row); - - assertThat(projected.getString(0)).asString().isEqualTo("test"); - assertThat(projected.getLong(1)).isEqualTo(34); - } - - @Test - public void testReorderedProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema reordered = - new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get())); - - RowData projected = writeAndRead("full_projection", schema, reordered, row); - - assertThat(projected.isNullAt(0)).isTrue(); - assertThat(projected.getString(1)).asString().isEqualTo("test"); - assertThat(projected.isNullAt(2)).isTrue(); - } - - @Test - public void testRenamedAddedField() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(1, "a", Types.LongType.get()), - Types.NestedField.required(2, "b", Types.LongType.get()), - Types.NestedField.required(3, "d", Types.LongType.get())); - - RowData row = GenericRowData.of(100L, 200L, 300L); - - Schema renamedAdded = - new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional(2, "b", Types.LongType.get()), - Types.NestedField.optional(3, "c", Types.LongType.get()), - Types.NestedField.optional(4, "d", Types.LongType.get())); - - RowData projected = writeAndRead("rename_and_add_column_projection", schema, renamedAdded, row); - assertThat(projected.getLong(0)) - .as("Should contain the correct value in column 1") - .isEqualTo(100L); - assertThat(projected.getLong(1)) - .as("Should contain the correct value in column 2") - .isEqualTo(200L); - assertThat(projected.getLong(2)) - .as("Should contain the correct value in column 1") - .isEqualTo(300L); - assertThat(projected.isNullAt(3)).as("Should contain empty value on new column 4").isTrue(); - } - - @Test - public void testEmptyProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - RowData projected = writeAndRead("empty_projection", schema, schema.select(), row); - - assertThat(projected).isNotNull(); - assertThat(projected.getArity()).isEqualTo(0); - } - - @Test - public void testBasicProjection() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("basic_projection_id", writeSchema, idOnly, row); - assertThat(projected.getArity()).as("Should not project data").isEqualTo(1); - assertThat(projected.getLong(0)).isEqualTo(34L); - - Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); - - projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, row); - - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - int cmp = Comparators.charSequences().compare("test", projected.getString(0).toString()); - 
assertThat(projected.getString(0)).asString().isEqualTo("test"); - } - - @Test - public void testRename() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema readSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get())); - - RowData projected = writeAndRead("project_and_rename", writeSchema, readSchema, row); - - assertThat(projected.getLong(0)).isEqualTo(34L); - assertThat(projected.getString(1)) - .as("Should contain the correct data/renamed value") - .asString() - .isEqualTo("test"); - } - - @Test - public void testNestedStructProjection() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 3, - "location", - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get())))); - - RowData location = GenericRowData.of(52.995143f, -1.539054f); - RowData record = GenericRowData.of(34L, location); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, record); - assertThat(projected.getArity()).isEqualTo(1); - assertThat(projected.getLong(0)).as("Should contain the correct id value").isEqualTo(34L); - - Schema latOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); - - projected = writeAndRead("latitude_only", writeSchema, latOnly, record); - RowData projectedLocation = projected.getRow(0, 1); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); - assertThat(projectedLocation.getArity()).as("Should not project longitude").isEqualTo(1); - assertThat(projectedLocation.getFloat(0)) - .as("Should project latitude") - .isEqualTo(52.995143f, withPrecision(0.000001f)); - - Schema longOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); - - projected = writeAndRead("longitude_only", writeSchema, longOnly, record); - projectedLocation = projected.getRow(0, 1); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); - assertThat(projectedLocation.getArity()).as("Should not project latitutde").isEqualTo(1); - assertThat(projectedLocation.getFloat(0)) - .as("Should project longitude") - .isEqualTo(-1.539054f, withPrecision(0.000001f)); - - Schema locationOnly = writeSchema.select("location"); - projected = writeAndRead("location_only", writeSchema, locationOnly, record); - projectedLocation = projected.getRow(0, 1); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); - assertThat(projectedLocation.getFloat(0)) - .as("Should project latitude") - .isEqualTo(52.995143f, withPrecision(0.000001f)); - assertThat(projectedLocation.getFloat(1)) - .as("Should project longitude") - .isEqualTo(-1.539054f, withPrecision(0.000001f)); - 
} - - @Test - public void testMapProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, - "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); - - GenericMapData properties = - new GenericMapData( - ImmutableMap.of( - StringData.fromString("a"), - StringData.fromString("A"), - StringData.fromString("b"), - StringData.fromString("B"))); - - RowData row = GenericRowData.of(34L, properties); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - assertThat(projected.getLong(0)).isEqualTo(34L); - assertThat(projected.getArity()).as("Should not project properties map").isEqualTo(1); - - Schema keyOnly = writeSchema.select("properties.key"); - projected = writeAndRead("key_only", writeSchema, keyOnly, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getMap(0)).isEqualTo(properties); - - Schema valueOnly = writeSchema.select("properties.value"); - projected = writeAndRead("value_only", writeSchema, valueOnly, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getMap(0)).isEqualTo(properties); - - Schema mapOnly = writeSchema.select("properties"); - projected = writeAndRead("map_only", writeSchema, mapOnly, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getMap(0)).isEqualTo(properties); - } - - @Test - public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, - "locations", - Types.MapType.ofOptional( - 6, - 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()))))); - - RowData l1 = GenericRowData.of(53.992811f, -1.542616f); - RowData l2 = GenericRowData.of(52.995143f, -1.539054f); - GenericMapData map = - new GenericMapData( - ImmutableMap.of(StringData.fromString("L1"), l1, StringData.fromString("L2"), l2)); - RowData row = GenericRowData.of(34L, map); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - assertThat(projected.getLong(0)).isEqualTo(34L); - assertThat(projected.getArity()).as("Should not project locations map").isEqualTo(1); - - projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getMap(0)).isEqualTo(row.getMap(1)); - - projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), row); - GenericMapData locations = (GenericMapData) projected.getMap(0); - assertThat(locations).isNotNull(); - GenericArrayData l1l2Array = - new GenericArrayData( - new Object[] {StringData.fromString("L2"), StringData.fromString("L1")}); - assertThat(locations.keyArray()).isEqualTo(l1l2Array); - RowData projectedL1 = (RowData) locations.get(StringData.fromString("L1")); - assertThat(projectedL1).isNotNull(); - assertThat(projectedL1.getFloat(0)) - .as("L1 should contain lat") - .isEqualTo(53.992811f, withPrecision(0.000001f)); - 
assertThat(projectedL1.getArity()).as("L1 should not contain long").isEqualTo(1); - RowData projectedL2 = (RowData) locations.get(StringData.fromString("L2")); - assertThat(projectedL2).isNotNull(); - assertThat(projectedL2.getFloat(0)) - .as("L2 should contain lat") - .isEqualTo(52.995143f, withPrecision(0.000001f)); - assertThat(projectedL2.getArity()).as("L2 should not contain long").isEqualTo(1); - - projected = writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - locations = (GenericMapData) projected.getMap(0); - assertThat(locations).isNotNull(); - assertThat(locations.keyArray()).isEqualTo(l1l2Array); - projectedL1 = (RowData) locations.get(StringData.fromString("L1")); - assertThat(projectedL1).isNotNull(); - assertThat(projectedL1.getArity()).as("L1 should not contain lat").isEqualTo(1); - assertThat(projectedL1.getFloat(0)) - .as("L1 should contain long") - .isEqualTo(-1.542616f, withPrecision(0.000001f)); - projectedL2 = (RowData) locations.get(StringData.fromString("L2")); - assertThat(projectedL2).isNotNull(); - assertThat(projectedL2.getArity()).as("L2 should not contain lat").isEqualTo(1); - assertThat(projectedL2.getFloat(0)) - .as("L2 should contain long") - .isEqualTo(-1.539054f, withPrecision(0.000001f)); - - Schema latitiudeRenamed = - new Schema( - Types.NestedField.optional( - 5, - "locations", - Types.MapType.ofOptional( - 6, - 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); - - projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - locations = (GenericMapData) projected.getMap(0); - assertThat(locations).isNotNull(); - assertThat(locations.keyArray()).isEqualTo(l1l2Array); - projectedL1 = (RowData) locations.get(StringData.fromString("L1")); - assertThat(projectedL1).isNotNull(); - assertThat(projectedL1.getFloat(0)) - .as("L1 should contain latitude") - .isEqualTo(53.992811f, withPrecision(0.000001f)); - projectedL2 = (RowData) locations.get(StringData.fromString("L2")); - assertThat(projectedL2).isNotNull(); - assertThat(projectedL2.getFloat(0)) - .as("L2 should contain latitude") - .isEqualTo(52.995143f, withPrecision(0.000001f)); - } - - @Test - public void testListProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); - - GenericArrayData values = new GenericArrayData(new Long[] {56L, 57L, 58L}); - - RowData row = GenericRowData.of(34L, values); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - assertThat(projected.getLong(0)).isEqualTo(34L); - assertThat(projected.getArity()).as("Should not project values list").isEqualTo(1); - - Schema elementOnly = writeSchema.select("values.element"); - projected = writeAndRead("element_only", writeSchema, elementOnly, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getArray(0)).isEqualTo(values); - - Schema listOnly = writeSchema.select("values"); - projected = writeAndRead("list_only", writeSchema, listOnly, row); - assertThat(projected.getArity()).as("Should not project 
id").isEqualTo(1); - assertThat(projected.getArray(0)).isEqualTo(values); - } - - @Test - @SuppressWarnings("unchecked") - public void testListOfStructsProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 22, - "points", - Types.ListType.ofOptional( - 21, - Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); - - RowData p1 = GenericRowData.of(1, 2); - RowData p2 = GenericRowData.of(3, null); - GenericArrayData arrayData = new GenericArrayData(new RowData[] {p1, p2}); - RowData row = GenericRowData.of(34L, arrayData); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - assertThat(projected.getLong(0)).isEqualTo(34L); - assertThat(projected.getArity()).isEqualTo(1); - - projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getArray(0)).isEqualTo(row.getArray(1)); - - projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).isFalse(); - ArrayData points = projected.getArray(0); - assertThat(points.size()).isEqualTo(2); - RowData projectedP1 = points.getRow(0, 2); - assertThat(projectedP1.getInt(0)).as("Should project x").isEqualTo(1); - assertThat(projectedP1.getArity()).as("Should not project y").isEqualTo(1); - RowData projectedP2 = points.getRow(1, 2); - assertThat(projectedP2.getArity()).as("Should not project y").isEqualTo(1); - assertThat(projectedP2.getInt(0)).as("Should project x").isEqualTo(3); - - projected = writeAndRead("y_only", writeSchema, writeSchema.select("points.y"), row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).isFalse(); - points = projected.getArray(0); - assertThat(points.size()).isEqualTo(2); - projectedP1 = points.getRow(0, 2); - assertThat(projectedP1.getArity()).as("Should not project x").isEqualTo(1); - assertThat(projectedP1.getInt(0)).as("Should project y").isEqualTo(2); - projectedP2 = points.getRow(1, 2); - assertThat(projectedP2.getArity()).as("Should not project x").isEqualTo(1); - assertThat(projectedP2.isNullAt(0)).as("Should project null y").isTrue(); - - Schema yRenamed = - new Schema( - Types.NestedField.optional( - 22, - "points", - Types.ListType.ofOptional( - 21, - Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); - - projected = writeAndRead("y_renamed", writeSchema, yRenamed, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).isFalse(); - points = projected.getArray(0); - assertThat(points.size()).isEqualTo(2); - projectedP1 = points.getRow(0, 2); - assertThat(projectedP1.getArity()).as("Should not project x and y").isEqualTo(1); - assertThat(projectedP1.getInt(0)).as("Should project z").isEqualTo(2); - projectedP2 = points.getRow(1, 2); - assertThat(projectedP2.getArity()).as("Should not project x and y").isEqualTo(1); - assertThat(projectedP2.isNullAt(0)).as("Should project null z").isTrue(); - } - - @Test - public void testAddedFieldsWithRequiredChildren() throws Exception { - 
Schema schema = new Schema(Types.NestedField.required(1, "a", Types.LongType.get())); - - RowData row = GenericRowData.of(100L); - - Schema addedFields = - new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional( - 2, - "b", - Types.StructType.of(Types.NestedField.required(3, "c", Types.LongType.get()))), - Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), - Types.NestedField.optional( - 6, - "e", - Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get()))); - - RowData projected = - writeAndRead("add_fields_with_required_children_projection", schema, addedFields, row); - assertThat(projected.getLong(0)) - .as("Should contain the correct value in column 1") - .isEqualTo(100L); - assertThat(projected.isNullAt(1)).as("Should contain empty value in new column 2").isTrue(); - assertThat(projected.isNullAt(2)).as("Should contain empty value in new column 4").isTrue(); - assertThat(projected.isNullAt(3)).as("Should contain empty value in new column 6").isTrue(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java deleted file mode 100644 index eccab20e04fc..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.flink.DataGenerator; -import org.apache.iceberg.flink.DataGenerators; -import org.apache.iceberg.flink.TestHelpers; -import org.junit.jupiter.api.Test; - -public class TestStructRowData { - - protected void testConverter(DataGenerator dataGenerator) { - StructRowData converter = new StructRowData(dataGenerator.icebergSchema().asStruct()); - GenericRecord expected = dataGenerator.generateIcebergGenericRecord(); - StructRowData actual = converter.setStruct(expected); - TestHelpers.assertRowData(dataGenerator.icebergSchema(), expected, actual); - } - - @Test - public void testPrimitiveTypes() { - testConverter(new DataGenerators.Primitives()); - } - - @Test - public void testStructOfPrimitive() { - testConverter(new DataGenerators.StructOfPrimitive()); - } - - @Test - public void testStructOfArray() { - testConverter(new DataGenerators.StructOfArray()); - } - - @Test - public void testStructOfMap() { - testConverter(new DataGenerators.StructOfMap()); - } - - @Test - public void testStructOfStruct() { - testConverter(new DataGenerators.StructOfStruct()); - } - - @Test - public void testArrayOfPrimitive() { - testConverter(new DataGenerators.ArrayOfPrimitive()); - } - - @Test - public void testArrayOfArray() { - testConverter(new DataGenerators.ArrayOfArray()); - } - - @Test - public void testArrayOfMap() { - testConverter(new DataGenerators.ArrayOfMap()); - } - - @Test - public void testArrayOfStruct() { - testConverter(new DataGenerators.ArrayOfStruct()); - } - - @Test - public void testMapOfPrimitives() { - testConverter(new DataGenerators.MapOfPrimitives()); - } - - @Test - public void testMapOfArray() { - testConverter(new DataGenerators.MapOfArray()); - } - - @Test - public void testMapOfMap() { - testConverter(new DataGenerators.MapOfMap()); - } - - @Test - public void testMapOfStruct() { - testConverter(new DataGenerators.MapOfStruct()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java deleted file mode 100644 index 10efb9120c6e..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskInfraExtension.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.api; - -import static org.apache.iceberg.flink.maintenance.operator.OperatorTestBase.IGNORED_OPERATOR_NAME; - -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.flink.maintenance.operator.CollectingSink; -import org.apache.iceberg.flink.maintenance.operator.ManualSource; -import org.junit.jupiter.api.extension.BeforeEachCallback; -import org.junit.jupiter.api.extension.ExtensionContext; - -/** - * {@link org.junit.jupiter.api.extension.Extension} used to generate the common elements for the - * {@link MaintenanceTaskBuilder} implementations. These are the following: - * - *

- * <ul>
- *   <li>{@link StreamExecutionEnvironment} - environment for testing
- *   <li>{@link ManualSource} - source for manually emitting {@link Trigger}s
- *   <li>{@link DataStream} - which is generated from the {@link ManualSource}
- *   <li>{@link CollectingSink} - which can be used to poll for the records emitted by the
- *       maintenance tasks
- * </ul>
    - */ -class MaintenanceTaskInfraExtension implements BeforeEachCallback { - private StreamExecutionEnvironment env; - private ManualSource source; - private DataStream triggerStream; - private CollectingSink sink; - - @Override - public void beforeEach(ExtensionContext context) { - this.env = StreamExecutionEnvironment.getExecutionEnvironment(); - this.source = new ManualSource<>(env, TypeInformation.of(Trigger.class)); - // Adds the watermark to mimic the behaviour expected for the input of the maintenance tasks - this.triggerStream = - source - .dataStream() - .assignTimestampsAndWatermarks(new TableMaintenance.PunctuatedWatermarkStrategy()) - .name(IGNORED_OPERATOR_NAME) - .forceNonParallel(); - this.sink = new CollectingSink<>(); - } - - StreamExecutionEnvironment env() { - return env; - } - - ManualSource source() { - return source; - } - - DataStream triggerStream() { - return triggerStream; - } - - CollectingSink sink() { - return sink; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java deleted file mode 100644 index fc8f7ad5124e..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/MaintenanceTaskTestBase.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.api; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.time.Duration; -import java.util.function.Supplier; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.flink.maintenance.operator.CollectingSink; -import org.apache.iceberg.flink.maintenance.operator.ManualSource; -import org.apache.iceberg.flink.maintenance.operator.OperatorTestBase; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.extension.RegisterExtension; - -class MaintenanceTaskTestBase extends OperatorTestBase { - private static final int TESTING_TASK_ID = 0; - private static final Duration POLL_DURATION = Duration.ofSeconds(5); - - @RegisterExtension MaintenanceTaskInfraExtension infra = new MaintenanceTaskInfraExtension(); - - void runAndWaitForSuccess( - StreamExecutionEnvironment env, - ManualSource triggerSource, - CollectingSink collectingSink) - throws Exception { - runAndWaitForResult( - env, - triggerSource, - collectingSink, - false /* generateFailure */, - () -> true /* waitForCondition */, - true /* resultSuccess */); - } - - void runAndWaitForSuccess( - StreamExecutionEnvironment env, - ManualSource triggerSource, - CollectingSink collectingSink, - Supplier waitForCondition) - throws Exception { - runAndWaitForResult( - env, - triggerSource, - collectingSink, - false /* generateFailure */, - waitForCondition, - true /* resultSuccess */); - } - - void runAndWaitForFailure( - StreamExecutionEnvironment env, - ManualSource triggerSource, - CollectingSink collectingSink) - throws Exception { - runAndWaitForResult( - env, - triggerSource, - collectingSink, - true /* generateFailure */, - () -> true /* waitForCondition */, - true /* resultSuccess */); - } - - void runAndWaitForResult( - StreamExecutionEnvironment env, - ManualSource triggerSource, - CollectingSink collectingSink, - boolean generateFailure, - Supplier waitForCondition, - boolean resultSuccess) - throws Exception { - JobClient jobClient = null; - try { - jobClient = env.executeAsync(); - - // Do a single successful task run - long time = System.currentTimeMillis(); - triggerSource.sendRecord(Trigger.create(time, TESTING_TASK_ID), time); - - TaskResult result = collectingSink.poll(POLL_DURATION); - - assertThat(result.startEpoch()).isEqualTo(time); - assertThat(result.success()).isEqualTo(resultSuccess); - assertThat(result.taskIndex()).isEqualTo(TESTING_TASK_ID); - - if (generateFailure) { - dropTable(); - time = System.currentTimeMillis(); - triggerSource.sendRecord(Trigger.create(time, TESTING_TASK_ID), time); - result = collectingSink.poll(POLL_DURATION); - - assertThat(result.startEpoch()).isEqualTo(time); - assertThat(result.success()).isFalse(); - assertThat(result.taskIndex()).isEqualTo(TESTING_TASK_ID); - } - - Awaitility.await().until(waitForCondition::get); - } finally { - closeJobClient(jobClient); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java deleted file mode 100644 index 12f5269773d1..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestDeleteOrphanFiles.java +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.api; - -import static org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.DELETE_FILES_TASK_NAME; -import static org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.FILESYSTEM_FILES_TASK_NAME; -import static org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.METADATA_FILES_TASK_NAME; -import static org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.PLANNER_TASK_NAME; -import static org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles.READER_TASK_NAME; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.DELETE_FILE_FAILED_COUNTER; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.DELETE_FILE_SUCCEEDED_COUNTER; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.ERROR_COUNTER; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.FileSystems; -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.Duration; -import java.util.List; -import java.util.stream.StreamSupport; -import org.apache.flink.streaming.api.graph.StreamGraphGenerator; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.maintenance.operator.MetricsReporterFactoryForTests; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.Test; - -class TestDeleteOrphanFiles extends MaintenanceTaskTestBase { - - private Path relative(Table table, String relativePath) { - return FileSystems.getDefault().getPath(table.location().substring(5), relativePath); - } - - private void createFiles(Path... 
paths) throws IOException { - for (Path path : paths) { - Files.write(path, "DUMMY".getBytes(StandardCharsets.UTF_8)); - } - } - - @Test - void testDeleteOrphanFilesUnPartitioned() throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - insert(table, 4, "d"); - - assertFileNum(table, 4, 0); - - Path inData = relative(table, "metadata/in_data"); - Path inMetadata = relative(table, "metadata/in_metadata"); - - createFiles(inData); - createFiles(inMetadata); - assertThat(inMetadata).exists(); - assertThat(inData).exists(); - - appendDeleteOrphanFiles(); - - runAndWaitForSuccess( - infra.env(), infra.source(), infra.sink(), () -> checkDeleteFinished(table.name(), 2L)); - assertThat(inMetadata).doesNotExist(); - assertThat(inData).doesNotExist(); - assertFileNum(table, 4, 0); - - // Check the metrics - MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put( - ImmutableList.of( - PLANNER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - READER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - FILESYSTEM_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - METADATA_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - DELETE_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - DELETE_FILE_FAILED_COUNTER), - 0L) - .put( - ImmutableList.of( - DELETE_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - DELETE_FILE_SUCCEEDED_COUNTER), - 2L) - .build()); - } - - @Test - void testDeleteOrphanFilesPartitioned() throws Exception { - Table table = createPartitionedTable(); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); - - assertFileNum(table, 4, 0); - - Path inMetadata = relative(table, "metadata/in_metadata"); - Path inData = relative(table, "metadata/in_data"); - - createFiles(inMetadata); - createFiles(inData); - assertThat(inMetadata).exists(); - assertThat(inData).exists(); - - appendDeleteOrphanFiles(); - - runAndWaitForSuccess( - infra.env(), infra.source(), infra.sink(), () -> checkDeleteFinished(table.name(), 2L)); - assertThat(inMetadata).doesNotExist(); - assertThat(inData).doesNotExist(); - - assertFileNum(table, 4, 0); - - // Check the metrics - MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put( - ImmutableList.of( - PLANNER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - READER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - FILESYSTEM_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - METADATA_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - DELETE_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - DELETE_FILE_FAILED_COUNTER), - 0L) - .put( - ImmutableList.of( - DELETE_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - DELETE_FILE_SUCCEEDED_COUNTER), - 2L) - .build()); - } - - @Test - void testDeleteOrphanFilesFailure() throws Exception { - Table table = createTable(); - 
insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - insert(table, 4, "d"); - - assertFileNum(table, 4, 0); - - Path inData = relative(table, "metadata/in_data"); - Path inMetadata = relative(table, "metadata/in_metadata"); - - createFiles(inData); - createFiles(inMetadata); - assertThat(inMetadata).exists(); - assertThat(inData).exists(); - - appendDeleteOrphanFiles(); - - // Mock error in the delete files operator - Long parentId = table.currentSnapshot().parentId(); - for (ManifestFile manifestFile : table.snapshot(parentId).allManifests(table.io())) { - table.io().deleteFile(manifestFile.path()); - } - - runAndWaitForResult( - infra.env(), - infra.source(), - infra.sink(), - false /* generateFailure */, - () -> checkDeleteFinished(table.name(), 0L), - false /* resultSuccess*/); - - // An error occurred; the file should not be deleted. And the job should not be failed. - assertThat(inMetadata).exists(); - assertThat(inData).exists(); - - // Check the metrics - MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put( - ImmutableList.of( - PLANNER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - READER_TASK_NAME + "[0]", table.name(), DUMMY_TASK_NAME, "0", ERROR_COUNTER), - 1L) - .put( - ImmutableList.of( - FILESYSTEM_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - METADATA_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - DELETE_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - DELETE_FILE_FAILED_COUNTER), - 0L) - .put( - ImmutableList.of( - DELETE_FILES_TASK_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - DELETE_FILE_SUCCEEDED_COUNTER), - 0L) - .build()); - } - - private void appendDeleteOrphanFiles() { - appendDeleteOrphanFiles(DeleteOrphanFiles.builder().minAge(Duration.ZERO)); - } - - private void appendDeleteOrphanFiles(DeleteOrphanFiles.Builder builder) { - builder - .append( - infra.triggerStream(), - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - 0, - tableLoader(), - UID_SUFFIX, - StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, - 1) - .sinkTo(infra.sink()); - } - - private static void assertFileNum( - Table table, int expectedDataFileNum, int expectedDeleteFileNum) { - table.refresh(); - assertThat( - table.currentSnapshot().dataManifests(table.io()).stream() - .flatMap( - m -> - StreamSupport.stream( - ManifestFiles.read(m, table.io(), table.specs()).spliterator(), false)) - .count()) - .isEqualTo(expectedDataFileNum); - assertThat( - table.currentSnapshot().deleteManifests(table.io()).stream() - .flatMap( - m -> - StreamSupport.stream( - ManifestFiles.readDeleteManifest(m, table.io(), table.specs()) - .spliterator(), - false)) - .count()) - .isEqualTo(expectedDeleteFileNum); - } - - private static boolean checkDeleteFinished(String tableName, Long expectedDeleteNum) { - return expectedDeleteNum.equals( - MetricsReporterFactoryForTests.counter( - ImmutableList.of( - DELETE_FILES_TASK_NAME + "[0]", - tableName, - DUMMY_TASK_NAME, - "0", - DELETE_FILE_SUCCEEDED_COUNTER))); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java deleted file mode 100644 index b8aa259e2f17..000000000000 --- 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestExpireSnapshots.java +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.api; - -import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; -import static org.apache.iceberg.flink.maintenance.api.ExpireSnapshots.DELETE_FILES_OPERATOR_NAME; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.DELETE_FILE_FAILED_COUNTER; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.DELETE_FILE_SUCCEEDED_COUNTER; -import static org.assertj.core.api.Assertions.assertThat; - -import java.time.Duration; -import java.util.List; -import java.util.Set; -import org.apache.flink.streaming.api.graph.StreamGraphGenerator; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.maintenance.operator.MetricsReporterFactoryForTests; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -class TestExpireSnapshots extends MaintenanceTaskTestBase { - private Table table; - - @BeforeEach - void before() { - MetricsReporterFactoryForTests.reset(); - this.table = createTable(); - tableLoader().open(); - } - - @Test - void testExpireSnapshots() throws Exception { - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - insert(table, 4, "d"); - - Set snapshots = Sets.newHashSet(table.snapshots()); - assertThat(snapshots).hasSize(4); - - ExpireSnapshots.builder() - .parallelism(1) - .planningWorkerPoolSize(2) - .deleteBatchSize(3) - .maxSnapshotAge(Duration.ZERO) - .retainLast(1) - .uidSuffix(UID_SUFFIX) - .append( - infra.triggerStream(), - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - 0, - tableLoader(), - "OTHER", - StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, - 1) - .sinkTo(infra.sink()); - - runAndWaitForSuccess( - infra.env(), infra.source(), infra.sink(), () -> checkDeleteFinished(table.name(), 3L)); - - table.refresh(); - assertThat(Sets.newHashSet(table.snapshots())).hasSize(1); - // Check that the table data not changed - SimpleDataUtil.assertTableRecords( - table, - ImmutableList.of( - createRecord(1, "a"), - createRecord(2, "b"), - createRecord(3, "c"), - createRecord(4, "d"))); - } - - @Test - void testFailure() throws Exception { - insert(table, 1, "a"); - insert(table, 2, "b"); - - ExpireSnapshots.builder() - .append( - 
infra.triggerStream(), - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - 0, - tableLoader(), - UID_SUFFIX, - StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, - 1) - .sinkTo(infra.sink()); - - runAndWaitForFailure(infra.env(), infra.source(), infra.sink()); - - // Check the metrics. There are no expired snapshots or data files because ExpireSnapshots has - // no max age of number of snapshots set, so no files are removed. - MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put( - ImmutableList.of( - DELETE_FILES_OPERATOR_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - DELETE_FILE_FAILED_COUNTER), - 0L) - .put( - ImmutableList.of( - DELETE_FILES_OPERATOR_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - DELETE_FILE_SUCCEEDED_COUNTER), - 0L) - .build()); - } - - @Test - void testUidAndSlotSharingGroup() { - ExpireSnapshots.builder() - .slotSharingGroup(SLOT_SHARING_GROUP) - .uidSuffix(UID_SUFFIX) - .append( - infra.triggerStream(), - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - 0, - tableLoader(), - UID_SUFFIX, - StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, - 1) - .sinkTo(infra.sink()); - - checkUidsAreSet(infra.env(), UID_SUFFIX); - checkSlotSharingGroupsAreSet(infra.env(), SLOT_SHARING_GROUP); - } - - @Test - void testUidAndSlotSharingGroupUnset() { - ExpireSnapshots.builder() - .append( - infra.triggerStream(), - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - 0, - tableLoader(), - UID_SUFFIX, - StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, - 1) - .sinkTo(infra.sink()); - - checkUidsAreSet(infra.env(), null); - checkSlotSharingGroupsAreSet(infra.env(), null); - } - - @Test - void testMetrics() throws Exception { - insert(table, 1, "a"); - insert(table, 2, "b"); - - ExpireSnapshots.builder() - .maxSnapshotAge(Duration.ZERO) - .retainLast(1) - .parallelism(1) - .append( - infra.triggerStream(), - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - 0, - tableLoader(), - UID_SUFFIX, - StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, - 1) - .sinkTo(infra.sink()); - - runAndWaitForSuccess( - infra.env(), infra.source(), infra.sink(), () -> checkDeleteFinished(table.name(), 1L)); - - // Check the metrics - Awaitility.await() - .untilAsserted( - () -> - MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put( - ImmutableList.of( - DELETE_FILES_OPERATOR_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - DELETE_FILE_FAILED_COUNTER), - 0L) - .put( - ImmutableList.of( - DELETE_FILES_OPERATOR_NAME + "[0]", - table.name(), - DUMMY_TASK_NAME, - "0", - DELETE_FILE_SUCCEEDED_COUNTER), - 1L) - .build())); - } - - private static boolean checkDeleteFinished(String tableName, Long expectedDeleteNum) { - return expectedDeleteNum.equals( - MetricsReporterFactoryForTests.counter( - ImmutableList.of( - DELETE_FILES_OPERATOR_NAME + "[0]", - tableName, - DUMMY_TASK_NAME, - "0", - DELETE_FILE_SUCCEEDED_COUNTER))); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java deleted file mode 100644 index 3cb18ffbb77e..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.api; - -import static org.apache.iceberg.flink.maintenance.api.JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY; - -import java.util.Map; -import java.util.UUID; -import org.apache.iceberg.jdbc.JdbcCatalog; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -class TestJdbcLockFactory extends TestLockFactoryBase { - @Override - TriggerLockFactory lockFactory(String tableName) { - Map properties = Maps.newHashMap(); - properties.put(JdbcCatalog.PROPERTY_PREFIX + "username", "user"); - properties.put(JdbcCatalog.PROPERTY_PREFIX + "password", "password"); - properties.put(INIT_LOCK_TABLES_PROPERTY, "true"); - - return new JdbcLockFactory( - "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""), - tableName, - properties); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java deleted file mode 100644 index 8a1b286ef591..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestLockFactoryBase.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.api; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -abstract class TestLockFactoryBase { - protected TriggerLockFactory lockFactory; - - abstract TriggerLockFactory lockFactory(String tableName); - - @BeforeEach - void before() { - this.lockFactory = lockFactory("tableName"); - lockFactory.open(); - } - - @AfterEach - void after() throws IOException { - lockFactory.close(); - } - - @Test - void testTryLock() { - TriggerLockFactory.Lock lock1 = lockFactory.createLock(); - TriggerLockFactory.Lock lock2 = lockFactory.createLock(); - assertThat(lock1.tryLock()).isTrue(); - assertThat(lock1.tryLock()).isFalse(); - assertThat(lock2.tryLock()).isFalse(); - } - - @Test - void testUnLock() { - TriggerLockFactory.Lock lock = lockFactory.createLock(); - assertThat(lock.tryLock()).isTrue(); - - lock.unlock(); - assertThat(lock.tryLock()).isTrue(); - } - - @Test - void testNoConflictWithRecoveryLock() { - TriggerLockFactory.Lock lock1 = lockFactory.createLock(); - TriggerLockFactory.Lock lock2 = lockFactory.createRecoveryLock(); - assertThat(lock1.tryLock()).isTrue(); - assertThat(lock2.tryLock()).isTrue(); - } - - @Test - void testDoubleUnLock() { - TriggerLockFactory.Lock lock = lockFactory.createLock(); - assertThat(lock.tryLock()).isTrue(); - - lock.unlock(); - lock.unlock(); - assertThat(lock.tryLock()).isTrue(); - assertThat(lock.tryLock()).isFalse(); - } - - @Test - void testMultiTableLock() throws IOException { - TriggerLockFactory other = lockFactory("tableName2"); - other.open(); - TriggerLockFactory.Lock lock1 = lockFactory.createLock(); - TriggerLockFactory.Lock lock2 = other.createLock(); - assertThat(lock1.tryLock()).isTrue(); - assertThat(lock2.tryLock()).isTrue(); - lock1.unlock(); - lock2.unlock(); - other.close(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java deleted file mode 100644 index 0a860fec4799..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestMaintenanceE2E.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.api; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.time.Duration; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.maintenance.operator.OperatorTestBase; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -class TestMaintenanceE2E extends OperatorTestBase { - private StreamExecutionEnvironment env; - - @BeforeEach - public void beforeEach() throws IOException { - this.env = StreamExecutionEnvironment.getExecutionEnvironment(); - Table table = createTable(); - insert(table, 1, "a"); - } - - @Test - void testE2e() throws Exception { - TableMaintenance.forTable(env, tableLoader(), LOCK_FACTORY) - .uidSuffix("E2eTestUID") - .rateLimit(Duration.ofMinutes(10)) - .lockCheckDelay(Duration.ofSeconds(10)) - .add( - ExpireSnapshots.builder() - .scheduleOnCommitCount(10) - .maxSnapshotAge(Duration.ofMinutes(10)) - .retainLast(5) - .deleteBatchSize(5) - .parallelism(8)) - .add( - RewriteDataFiles.builder() - .scheduleOnDataFileCount(10) - .partialProgressEnabled(true) - .partialProgressMaxCommits(10) - .maxRewriteBytes(1000L) - .targetFileSizeBytes(1000L) - .minFileSizeBytes(1000L) - .maxFileSizeBytes(1000L) - .minInputFiles(10) - .deleteFileThreshold(10) - .rewriteAll(false) - .maxFileGroupSizeBytes(1000L)) - .append(); - - JobClient jobClient = null; - try { - jobClient = env.executeAsync(); - - // Just make sure that we are able to instantiate the flow - assertThat(jobClient).isNotNull(); - } finally { - closeJobClient(jobClient); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java deleted file mode 100644 index 795057e23538..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java +++ /dev/null @@ -1,457 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.api; - -import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; -import static org.apache.iceberg.flink.maintenance.api.RewriteDataFiles.COMMIT_TASK_NAME; -import static org.apache.iceberg.flink.maintenance.api.RewriteDataFiles.PLANNER_TASK_NAME; -import static org.apache.iceberg.flink.maintenance.api.RewriteDataFiles.REWRITE_TASK_NAME; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.ADDED_DATA_FILE_NUM_METRIC; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.ADDED_DATA_FILE_SIZE_METRIC; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.ERROR_COUNTER; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.REMOVED_DATA_FILE_NUM_METRIC; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.REMOVED_DATA_FILE_SIZE_METRIC; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.stream.StreamSupport; -import org.apache.flink.streaming.api.graph.StreamGraphGenerator; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.Table; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.maintenance.operator.MetricsReporterFactoryForTests; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.Test; - -class TestRewriteDataFiles extends MaintenanceTaskTestBase { - @Test - void testRewriteUnpartitioned() throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - insert(table, 4, "d"); - - assertFileNum(table, 4, 0); - - appendRewriteDataFiles( - RewriteDataFiles.builder() - .parallelism(2) - .deleteFileThreshold(10) - .targetFileSizeBytes(1_000_000L) - .maxFileGroupSizeBytes(10_000_000L) - .maxFileSizeBytes(2_000_000L) - .minFileSizeBytes(500_000L) - .minInputFiles(2) - .partialProgressEnabled(true) - .partialProgressMaxCommits(1) - .maxRewriteBytes(100_000L) - .rewriteAll(false)); - - runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); - - assertFileNum(table, 1, 0); - - SimpleDataUtil.assertTableRecords( - table, - ImmutableList.of( - createRecord(1, "a"), - createRecord(2, "b"), - createRecord(3, "c"), - createRecord(4, "d"))); - } - - @Test - void testRewritePartitioned() throws Exception { - Table table = createPartitionedTable(); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); - - assertFileNum(table, 4, 0); - - appendRewriteDataFiles(); - - runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); - - assertFileNum(table, 2, 0); - - SimpleDataUtil.assertTableRecords( - table, - ImmutableList.of( - createRecord(1, "p1"), - createRecord(2, "p1"), - createRecord(3, "p2"), - createRecord(4, "p2"))); - } - - @Test - void testPlannerFailure() throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - - assertFileNum(table, 2, 0); - - appendRewriteDataFiles(); - - runAndWaitForFailure(infra.env(), infra.source(), infra.sink()); - - // Check the metrics. The first task should be successful, but the second one should fail. This - // should be represented in the counters. 
- MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put( - ImmutableList.of( - PLANNER_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 1L) - .put( - ImmutableList.of( - REWRITE_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ADDED_DATA_FILE_NUM_METRIC), - 1L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ADDED_DATA_FILE_SIZE_METRIC), - -1L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - REMOVED_DATA_FILE_NUM_METRIC), - 2L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - REMOVED_DATA_FILE_SIZE_METRIC), - -1L) - .build()); - } - - @Test - void testUidAndSlotSharingGroup() { - createTable(); - - RewriteDataFiles.builder() - .slotSharingGroup(SLOT_SHARING_GROUP) - .uidSuffix(UID_SUFFIX) - .append( - infra.triggerStream(), - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - 0, - tableLoader(), - "OTHER", - "OTHER", - 1) - .sinkTo(infra.sink()); - - checkUidsAreSet(infra.env(), UID_SUFFIX); - checkSlotSharingGroupsAreSet(infra.env(), SLOT_SHARING_GROUP); - } - - @Test - void testUidAndSlotSharingGroupUnset() { - createTable(); - - RewriteDataFiles.builder() - .append( - infra.triggerStream(), - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - 0, - tableLoader(), - UID_SUFFIX, - StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, - 1) - .sinkTo(infra.sink()); - - checkUidsAreSet(infra.env(), null); - checkSlotSharingGroupsAreSet(infra.env(), null); - } - - @Test - void testMetrics() throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - - assertFileNum(table, 2, 0); - - appendRewriteDataFiles(); - - runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); - - // Check the metrics - MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put( - ImmutableList.of( - PLANNER_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - REWRITE_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ADDED_DATA_FILE_NUM_METRIC), - 1L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ADDED_DATA_FILE_SIZE_METRIC), - -1L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - REMOVED_DATA_FILE_NUM_METRIC), - 2L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - REMOVED_DATA_FILE_SIZE_METRIC), - -1L) - .build()); - } - - @Test - void testV2Table() throws Exception { - Table table = createTableWithDelete(); - update(table, 1, null, "a", "b"); - update(table, 1, "b", "c"); - - assertFileNum(table, 2, 3); - SimpleDataUtil.assertTableRecords(table, ImmutableList.of(createRecord(1, "c"))); - - appendRewriteDataFiles(); - - runAndWaitForSuccess(infra.env(), infra.source(), 
infra.sink()); - - assertFileNum(table, 1, 1); - - SimpleDataUtil.assertTableRecords(table, ImmutableList.of(createRecord(1, "c"))); - - // Check the metrics - MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put( - ImmutableList.of( - PLANNER_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - REWRITE_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ERROR_COUNTER), - 0L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ADDED_DATA_FILE_NUM_METRIC), - 1L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - ADDED_DATA_FILE_SIZE_METRIC), - -1L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - REMOVED_DATA_FILE_NUM_METRIC), - 2L) - .put( - ImmutableList.of( - COMMIT_TASK_NAME + "[0]", - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - "0", - REMOVED_DATA_FILE_SIZE_METRIC), - -1L) - .build()); - } - - @Test - void testRewriteWithFilter() throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - insert(table, 4, "d"); - - assertFileNum(table, 4, 0); - - appendRewriteDataFiles( - RewriteDataFiles.builder() - .parallelism(2) - .deleteFileThreshold(10) - .targetFileSizeBytes(1_000_000L) - .maxFileGroupSizeBytes(10_000_000L) - .maxFileSizeBytes(2_000_000L) - .minFileSizeBytes(500_000L) - .minInputFiles(2) - // Only rewrite data files where id is 1 or 2 for testing rewrite - .filter(Expressions.in("id", 1, 2)) - .partialProgressEnabled(true) - .partialProgressMaxCommits(1) - .maxRewriteBytes(100_000L) - .rewriteAll(false)); - - runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); - - // There is four files, only id is 1 and 2 will be rewritten. so expect 3 files. 
- assertFileNum(table, 3, 0); - - SimpleDataUtil.assertTableRecords( - table, - ImmutableList.of( - createRecord(1, "a"), - createRecord(2, "b"), - createRecord(3, "c"), - createRecord(4, "d"))); - } - - private void appendRewriteDataFiles() { - appendRewriteDataFiles(RewriteDataFiles.builder().rewriteAll(true)); - } - - private void appendRewriteDataFiles(RewriteDataFiles.Builder builder) { - builder - .append( - infra.triggerStream(), - DUMMY_TABLE_NAME, - DUMMY_TASK_NAME, - 0, - tableLoader(), - UID_SUFFIX, - StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP, - 1) - .sinkTo(infra.sink()); - } - - private static void assertFileNum( - Table table, int expectedDataFileNum, int expectedDeleteFileNum) { - table.refresh(); - assertThat( - table.currentSnapshot().dataManifests(table.io()).stream() - .flatMap( - m -> - StreamSupport.stream( - ManifestFiles.read(m, table.io(), table.specs()).spliterator(), false)) - .count()) - .isEqualTo(expectedDataFileNum); - assertThat( - table.currentSnapshot().deleteManifests(table.io()).stream() - .flatMap( - m -> - StreamSupport.stream( - ManifestFiles.readDeleteManifest(m, table.io(), table.specs()) - .spliterator(), - false)) - .count()) - .isEqualTo(expectedDeleteFileNum); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java deleted file mode 100644 index 665a82ea15bb..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFilesConfig.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.api; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.time.Duration; -import java.util.Map; -import org.apache.flink.configuration.Configuration; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.maintenance.operator.OperatorTestBase; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -public class TestRewriteDataFilesConfig extends OperatorTestBase { - private Table table; - private Map input = Maps.newHashMap(); - - @BeforeEach - public void before() { - this.table = createTable(); - input.put( - RewriteDataFilesConfig.PREFIX - + org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, - "true"); - input.put( - RewriteDataFilesConfig.PREFIX - + org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, - "5"); - input.put(RewriteDataFilesConfig.MAX_BYTES, "1024"); - input.put(RewriteDataFilesConfig.SCHEDULE_ON_COMMIT_COUNT, "10"); - input.put(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_COUNT, "20"); - input.put(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_SIZE, "30"); - input.put(RewriteDataFilesConfig.SCHEDULE_ON_INTERVAL_SECOND, "60"); - input.put("other.config", "should-be-ignored"); - } - - @AfterEach - public void after() { - input.clear(); - } - - @Test - void testConfigParsing() { - RewriteDataFilesConfig config = new RewriteDataFilesConfig(table, input, new Configuration()); - - assertThat(config.partialProgressEnable()).isTrue(); - assertThat(config.partialProgressMaxCommits()).isEqualTo(5); - assertThat(config.maxRewriteBytes()).isEqualTo(1024L); - assertThat(config.scheduleOnCommitCount()).isEqualTo(10); - assertThat(config.scheduleOnDataFileCount()).isEqualTo(20); - assertThat(config.scheduleOnDataFileSize()).isEqualTo(30); - assertThat(config.scheduleOnIntervalSecond()).isEqualTo(60); - } - - @Test - void testEmptyConfig() { - RewriteDataFilesConfig config = - new RewriteDataFilesConfig(table, Maps.newHashMap(), new Configuration()); - - assertThat(config.partialProgressEnable()) - .isEqualTo(org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_ENABLED_DEFAULT); - assertThat(config.partialProgressMaxCommits()) - .isEqualTo( - org.apache.iceberg.actions.RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT); - assertThat(config.maxRewriteBytes()).isEqualTo(Long.MAX_VALUE); - assertThat(config.scheduleOnCommitCount()) - .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_COMMIT_COUNT_OPTION.defaultValue()); - assertThat(config.scheduleOnDataFileCount()) - .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_COUNT_OPTION.defaultValue()); - assertThat(config.scheduleOnDataFileSize()) - .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_SIZE_OPTION.defaultValue()); - assertThat(config.scheduleOnIntervalSecond()) - .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_INTERVAL_SECOND_OPTION.defaultValue()); - } - - @Test - void testPropertiesMethodWithAllConfigs() { - RewriteDataFilesConfig config = new RewriteDataFilesConfig(table, input, new Configuration()); - - // check the config about the rewriter - assertThat(config.partialProgressEnable()).isTrue(); - assertThat(config.partialProgressMaxCommits()).isEqualTo(5); - assertThat(config.maxRewriteBytes()).isEqualTo(1024L); - - // check the config about the schedule - assertThat(config.scheduleOnCommitCount()).isEqualTo(10); - 
assertThat(config.scheduleOnDataFileCount()).isEqualTo(20); - assertThat(config.scheduleOnDataFileSize()).isEqualTo(30); - assertThat(config.scheduleOnIntervalSecond()).isEqualTo(Duration.ofSeconds(60).toSeconds()); - - assertThat(config.properties()) - .doesNotContainKey("custom.option") - .containsEntry("partial-progress.enabled", "true") - .containsEntry("partial-progress.max-commits", "5") - .containsEntry("max-bytes", "1024") - .containsEntry("schedule.commit-count", "10") - .containsEntry("schedule.data-file-count", "20") - .containsEntry("schedule.data-file-size", "30") - .containsEntry("schedule.interval-second", "60"); - } - - @Test - void testPropertiesWithDefaultConfig() { - RewriteDataFilesConfig config = - new RewriteDataFilesConfig(table, Maps.newHashMap(), new Configuration()); - - // check the config about the rewriter - assertThat(config.partialProgressEnable()).isFalse(); - assertThat(config.partialProgressMaxCommits()) - .isEqualTo(RewriteDataFilesConfig.PARTIAL_PROGRESS_MAX_COMMITS_OPTION.defaultValue()); - assertThat(config.maxRewriteBytes()).isEqualTo(Long.MAX_VALUE); - - // check the config about the schedule - assertThat(config.scheduleOnCommitCount()) - .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_COMMIT_COUNT_OPTION.defaultValue()); - assertThat(config.scheduleOnDataFileCount()) - .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_COUNT_OPTION.defaultValue()); - assertThat(config.scheduleOnDataFileSize()) - .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_SIZE_OPTION.defaultValue()); - assertThat(config.scheduleOnIntervalSecond()) - .isEqualTo(RewriteDataFilesConfig.SCHEDULE_ON_INTERVAL_SECOND_OPTION.defaultValue()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java deleted file mode 100644 index eaa5b5e1b5b1..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestTableMaintenance.java +++ /dev/null @@ -1,462 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.api; - -import static org.apache.iceberg.flink.SimpleDataUtil.createRowData; -import static org.apache.iceberg.flink.maintenance.api.TableMaintenance.LOCK_REMOVER_OPERATOR_NAME; -import static org.apache.iceberg.flink.maintenance.api.TableMaintenance.SOURCE_OPERATOR_NAME_PREFIX; -import static org.apache.iceberg.flink.maintenance.api.TableMaintenance.TRIGGER_MANAGER_OPERATOR_NAME; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.CONCURRENT_RUN_THROTTLED; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.FAILED_TASK_COUNTER; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.NOTHING_TO_TRIGGER; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.RATE_LIMITER_TRIGGERED; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.SUCCEEDED_TASK_COUNTER; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.TRIGGERED; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.io.Serializable; -import java.time.Duration; -import java.util.Collections; -import java.util.List; -import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.dag.Transformation; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.flink.api.java.typeutils.ResultTypeQueryable; -import org.apache.flink.configuration.CheckpointingOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.transformations.SourceTransformation; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.operator.ManualSource; -import org.apache.iceberg.flink.maintenance.operator.MetricsReporterFactoryForTests; -import org.apache.iceberg.flink.maintenance.operator.OperatorTestBase; -import org.apache.iceberg.flink.maintenance.operator.TableChange; -import org.apache.iceberg.flink.sink.FlinkSink; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -class TestTableMaintenance extends OperatorTestBase { - private static final String MAINTENANCE_TASK_NAME = "TestTableMaintenance"; - private static final String[] TASKS = - new String[] {MAINTENANCE_TASK_NAME + " [0]", MAINTENANCE_TASK_NAME + " [1]"}; - private static final TableChange DUMMY_CHANGE = TableChange.builder().commitCount(1).build(); - private static final List PROCESSED = - Collections.synchronizedList(Lists.newArrayListWithCapacity(1)); - - private StreamExecutionEnvironment env; - private Table table; - - @TempDir private File checkpointDir; - - @BeforeEach - public void beforeEach() throws 
IOException { - Configuration config = new Configuration(); - config.set(CheckpointingOptions.CHECKPOINT_STORAGE, "filesystem"); - config.set(CheckpointingOptions.CHECKPOINTS_DIRECTORY, "file://" + checkpointDir.getPath()); - this.env = StreamExecutionEnvironment.getExecutionEnvironment(config); - this.table = createTable(); - insert(table, 1, "a"); - - PROCESSED.clear(); - MaintenanceTaskBuilderForTest.counter = 0; - } - - @Test - void testForChangeStream() throws Exception { - ManualSource schedulerSource = - new ManualSource<>(env, TypeInformation.of(TableChange.class)); - - TableMaintenance.Builder streamBuilder = - TableMaintenance.forChangeStream(schedulerSource.dataStream(), tableLoader(), LOCK_FACTORY) - .rateLimit(Duration.ofMillis(2)) - .lockCheckDelay(Duration.ofSeconds(3)) - .add( - new MaintenanceTaskBuilderForTest(true) - .scheduleOnCommitCount(1) - .scheduleOnDataFileCount(2) - .scheduleOnDataFileSize(3L) - .scheduleOnEqDeleteFileCount(4) - .scheduleOnEqDeleteRecordCount(5L) - .scheduleOnPosDeleteFileCount(6) - .scheduleOnPosDeleteRecordCount(7L) - .scheduleOnInterval(Duration.ofHours(1))); - - sendEvents(schedulerSource, streamBuilder, ImmutableList.of(Tuple2.of(DUMMY_CHANGE, 1))); - } - - @Test - void testForTable() throws Exception { - TableLoader tableLoader = tableLoader(); - - env.enableCheckpointing(10); - - TableMaintenance.forTable(env, tableLoader, LOCK_FACTORY) - .rateLimit(Duration.ofMillis(2)) - .maxReadBack(2) - .add(new MaintenanceTaskBuilderForTest(true).scheduleOnCommitCount(2)) - .append(); - - // Creating a stream for inserting data into the table concurrently - ManualSource insertSource = - new ManualSource<>(env, InternalTypeInfo.of(FlinkSchemaUtil.convert(table.schema()))); - FlinkSink.forRowData(insertSource.dataStream()) - .tableLoader(tableLoader) - .uidPrefix(UID_SUFFIX + "-iceberg-sink") - .append(); - - JobClient jobClient = null; - try { - jobClient = env.executeAsync(); - - insertSource.sendRecord(createRowData(2, "b")); - - Awaitility.await().until(() -> PROCESSED.size() == 1); - } finally { - closeJobClient(jobClient); - } - } - - @Test - void testLocking() throws Exception { - TriggerLockFactory.Lock lock = LOCK_FACTORY.createLock(); - - ManualSource schedulerSource = - new ManualSource<>(env, TypeInformation.of(TableChange.class)); - - TableMaintenance.Builder streamBuilder = - TableMaintenance.forChangeStream(schedulerSource.dataStream(), tableLoader(), LOCK_FACTORY) - .rateLimit(Duration.ofMillis(2)) - .add(new MaintenanceTaskBuilderForTest(true).scheduleOnCommitCount(1)); - - assertThat(lock.isHeld()).isFalse(); - sendEvents(schedulerSource, streamBuilder, ImmutableList.of(Tuple2.of(DUMMY_CHANGE, 1))); - - assertThat(lock.isHeld()).isFalse(); - } - - @Test - void testMetrics() throws Exception { - ManualSource schedulerSource = - new ManualSource<>(env, TypeInformation.of(TableChange.class)); - - TableMaintenance.Builder streamBuilder = - TableMaintenance.forChangeStream(schedulerSource.dataStream(), tableLoader(), LOCK_FACTORY) - .rateLimit(Duration.ofMillis(2)) - .lockCheckDelay(Duration.ofMillis(2)) - .add(new MaintenanceTaskBuilderForTest(true).scheduleOnCommitCount(1)) - .add(new MaintenanceTaskBuilderForTest(false).scheduleOnCommitCount(2)); - - sendEvents( - schedulerSource, - streamBuilder, - ImmutableList.of(Tuple2.of(DUMMY_CHANGE, 1), Tuple2.of(DUMMY_CHANGE, 2))); - - Awaitility.await() - .until( - () -> - MetricsReporterFactoryForTests.counter( - ImmutableList.of( - LOCK_REMOVER_OPERATOR_NAME, - table.name(), - TASKS[0], - "0", 
- SUCCEEDED_TASK_COUNTER)) - .equals(2L)); - - MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put( - ImmutableList.of( - LOCK_REMOVER_OPERATOR_NAME, - table.name(), - TASKS[0], - "0", - SUCCEEDED_TASK_COUNTER), - 2L) - .put( - ImmutableList.of( - LOCK_REMOVER_OPERATOR_NAME, table.name(), TASKS[0], "0", FAILED_TASK_COUNTER), - 0L) - .put( - ImmutableList.of( - TRIGGER_MANAGER_OPERATOR_NAME, table.name(), TASKS[0], "0", TRIGGERED), - 2L) - .put( - ImmutableList.of( - LOCK_REMOVER_OPERATOR_NAME, - table.name(), - TASKS[1], - "1", - SUCCEEDED_TASK_COUNTER), - 0L) - .put( - ImmutableList.of( - LOCK_REMOVER_OPERATOR_NAME, table.name(), TASKS[1], "1", FAILED_TASK_COUNTER), - 1L) - .put( - ImmutableList.of( - TRIGGER_MANAGER_OPERATOR_NAME, table.name(), TASKS[1], "1", TRIGGERED), - 1L) - .put( - ImmutableList.of(TRIGGER_MANAGER_OPERATOR_NAME, table.name(), NOTHING_TO_TRIGGER), - -1L) - .put( - ImmutableList.of( - TRIGGER_MANAGER_OPERATOR_NAME, table.name(), CONCURRENT_RUN_THROTTLED), - -1L) - .put( - ImmutableList.of( - TRIGGER_MANAGER_OPERATOR_NAME, table.name(), RATE_LIMITER_TRIGGERED), - -1L) - .build()); - } - - @Test - void testUidAndSlotSharingGroup() throws IOException { - TableMaintenance.forChangeStream( - new ManualSource<>(env, TypeInformation.of(TableChange.class)).dataStream(), - tableLoader(), - LOCK_FACTORY) - .uidSuffix(UID_SUFFIX) - .slotSharingGroup(SLOT_SHARING_GROUP) - .add( - new MaintenanceTaskBuilderForTest(true) - .scheduleOnCommitCount(1) - .uidSuffix(UID_SUFFIX) - .slotSharingGroup(SLOT_SHARING_GROUP)) - .append(); - - checkUidsAreSet(env, UID_SUFFIX); - checkSlotSharingGroupsAreSet(env, SLOT_SHARING_GROUP); - } - - @Test - void testUidAndSlotSharingGroupUnset() throws IOException { - TableMaintenance.forChangeStream( - new ManualSource<>(env, TypeInformation.of(TableChange.class)).dataStream(), - tableLoader(), - LOCK_FACTORY) - .add(new MaintenanceTaskBuilderForTest(true).scheduleOnCommitCount(1)) - .append(); - - checkUidsAreSet(env, null); - checkSlotSharingGroupsAreSet(env, null); - } - - @Test - void testUidAndSlotSharingGroupInherit() throws IOException { - TableMaintenance.forChangeStream( - new ManualSource<>(env, TypeInformation.of(TableChange.class)).dataStream(), - tableLoader(), - LOCK_FACTORY) - .uidSuffix(UID_SUFFIX) - .slotSharingGroup(SLOT_SHARING_GROUP) - .add(new MaintenanceTaskBuilderForTest(true).scheduleOnCommitCount(1)) - .append(); - - checkUidsAreSet(env, UID_SUFFIX); - checkSlotSharingGroupsAreSet(env, SLOT_SHARING_GROUP); - } - - @Test - void testUidAndSlotSharingGroupOverWrite() throws IOException { - String anotherUid = "Another-UID"; - String anotherSlotSharingGroup = "Another-SlotSharingGroup"; - TableMaintenance.forChangeStream( - new ManualSource<>(env, TypeInformation.of(TableChange.class)).dataStream(), - tableLoader(), - LOCK_FACTORY) - .uidSuffix(UID_SUFFIX) - .slotSharingGroup(SLOT_SHARING_GROUP) - .add( - new MaintenanceTaskBuilderForTest(true) - .scheduleOnCommitCount(1) - .uidSuffix(anotherUid) - .slotSharingGroup(anotherSlotSharingGroup)) - .append(); - - // Choose an operator from the scheduler part of the graph - Transformation schedulerTransformation = - env.getTransformations().stream() - .filter(t -> t.getName().equals("Trigger manager")) - .findFirst() - .orElseThrow(); - assertThat(schedulerTransformation.getUid()).contains(UID_SUFFIX); - assertThat(schedulerTransformation.getSlotSharingGroup()).isPresent(); - assertThat(schedulerTransformation.getSlotSharingGroup().get().getName()) - 
.isEqualTo(SLOT_SHARING_GROUP); - - // Choose an operator from the maintenance task part of the graph - Transformation scheduledTransformation = - env.getTransformations().stream() - .filter(t -> t.getName().startsWith(MAINTENANCE_TASK_NAME)) - .findFirst() - .orElseThrow(); - assertThat(scheduledTransformation.getUid()).contains(anotherUid); - assertThat(scheduledTransformation.getSlotSharingGroup()).isPresent(); - assertThat(scheduledTransformation.getSlotSharingGroup().get().getName()) - .isEqualTo(anotherSlotSharingGroup); - } - - @Test - void testUidAndSlotSharingGroupForMonitorSource() throws IOException { - TableMaintenance.forTable(env, tableLoader(), LOCK_FACTORY) - .uidSuffix(UID_SUFFIX) - .slotSharingGroup(SLOT_SHARING_GROUP) - .add( - new MaintenanceTaskBuilderForTest(true) - .scheduleOnCommitCount(1) - .uidSuffix(UID_SUFFIX) - .slotSharingGroup(SLOT_SHARING_GROUP)) - .append(); - - Transformation source = monitorSource(); - assertThat(source).isNotNull(); - assertThat(source.getUid()).contains(UID_SUFFIX); - assertThat(source.getSlotSharingGroup()).isPresent(); - assertThat(source.getSlotSharingGroup().get().getName()).isEqualTo(SLOT_SHARING_GROUP); - - checkUidsAreSet(env, UID_SUFFIX); - checkSlotSharingGroupsAreSet(env, SLOT_SHARING_GROUP); - } - - /** - * Sends the events though the {@link ManualSource} provided, and waits until the given number of - * records are processed. - * - * @param schedulerSource used for sending the events - * @param streamBuilder used for generating the job - * @param eventsAndResultNumbers the pair of the event and the expected processed records - * @throws Exception if any - */ - private void sendEvents( - ManualSource schedulerSource, - TableMaintenance.Builder streamBuilder, - List> eventsAndResultNumbers) - throws Exception { - streamBuilder.append(); - - JobClient jobClient = null; - try { - jobClient = env.executeAsync(); - - eventsAndResultNumbers.forEach( - eventsAndResultNumber -> { - int expectedSize = PROCESSED.size() + eventsAndResultNumber.f1; - schedulerSource.sendRecord(eventsAndResultNumber.f0); - Awaitility.await() - .until( - () -> PROCESSED.size() == expectedSize && !LOCK_FACTORY.createLock().isHeld()); - }); - } finally { - closeJobClient(jobClient); - } - } - - /** - * Finds the {@link org.apache.iceberg.flink.maintenance.operator.MonitorSource} for testing - * purposes by parsing the transformation tree. 
- * - * @return The monitor source if we found it - */ - private Transformation monitorSource() { - assertThat(env.getTransformations()).isNotEmpty(); - assertThat(env.getTransformations().get(0).getInputs()).isNotEmpty(); - assertThat(env.getTransformations().get(0).getInputs().get(0).getInputs()).isNotEmpty(); - - Transformation result = - env.getTransformations().get(0).getInputs().get(0).getInputs().get(0); - - // Some checks to make sure this is the transformation we are looking for - assertThat(result).isInstanceOf(SourceTransformation.class); - assertThat(result.getName()).startsWith(SOURCE_OPERATOR_NAME_PREFIX); - - return result; - } - - private static class MaintenanceTaskBuilderForTest - extends MaintenanceTaskBuilder { - private final boolean success; - private final int id; - private static int counter = 0; - - MaintenanceTaskBuilderForTest(boolean success) { - this.success = success; - this.id = counter; - ++counter; - } - - @Override - String maintenanceTaskName() { - return MAINTENANCE_TASK_NAME; - } - - @Override - DataStream append(DataStream trigger) { - String name = TASKS[id]; - return trigger - .map(new DummyMaintenanceTask(success)) - .name(name) - .uid(uidSuffix() + "-test-mapper-" + name + "-" + id) - .slotSharingGroup(slotSharingGroup()) - .forceNonParallel(); - } - } - - private static class DummyMaintenanceTask - implements MapFunction, ResultTypeQueryable, Serializable { - private final boolean success; - - private DummyMaintenanceTask(boolean success) { - this.success = success; - } - - @Override - public TaskResult map(Trigger trigger) { - // Ensure that the lock is held when processing - assertThat(LOCK_FACTORY.createLock().isHeld()).isTrue(); - PROCESSED.add(trigger); - - return new TaskResult( - trigger.taskId(), - trigger.timestamp(), - success, - success ? Collections.emptyList() : Lists.newArrayList(new Exception("Testing error"))); - } - - @Override - public TypeInformation getProducedType() { - return TypeInformation.of(TaskResult.class); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java deleted file mode 100644 index f1313c89ae53..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestZkLockFactory.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.api; - -import java.io.IOException; -import org.apache.curator.test.TestingServer; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; - -public class TestZkLockFactory extends TestLockFactoryBase { - - private TestingServer zkTestServer; - - @Override - TriggerLockFactory lockFactory(String tableName) { - return new ZkLockFactory(zkTestServer.getConnectString(), tableName, 5000, 3000, 1000, 3); - } - - @BeforeEach - @Override - void before() { - try { - zkTestServer = new TestingServer(); - } catch (Exception e) { - throw new RuntimeException(e); - } - - super.before(); - } - - @AfterEach - public void after() throws IOException { - super.after(); - if (zkTestServer != null) { - zkTestServer.close(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java deleted file mode 100644 index e7e818ba6887..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.time.Duration; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.flink.api.connector.sink2.Sink; -import org.apache.flink.api.connector.sink2.SinkWriter; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -/** Sink for collecting output during testing. */ -public class CollectingSink implements Sink { - private static final long serialVersionUID = 1L; - private static final List> QUEUES = - Collections.synchronizedList(Lists.newArrayListWithExpectedSize(1)); - private static final AtomicInteger NUM_SINKS = new AtomicInteger(-1); - private final int index; - - /** Creates a new sink which collects the elements received. */ - public CollectingSink() { - this.index = NUM_SINKS.incrementAndGet(); - QUEUES.add(new LinkedBlockingQueue<>()); - } - - /** - * Gets all the remaining output received by this {@link Sink}. - * - * @return all the remaining output - */ - List remainingOutput() { - return Lists.newArrayList((BlockingQueue) QUEUES.get(this.index)); - } - - /** - * Check if there is no remaining output received by this {@link Sink}. 
- * - * @return true if there is no remaining output - */ - boolean isEmpty() { - return QUEUES.get(this.index).isEmpty(); - } - - /** - * Wait until the next element received by the {@link Sink}. - * - * @param timeout for the poll - * @return The first element received by this {@link Sink} - * @throws TimeoutException if no element received until the timeout - */ - public T poll(Duration timeout) throws TimeoutException { - Object element; - - try { - element = QUEUES.get(this.index).poll(timeout.toMillis(), TimeUnit.MILLISECONDS); - } catch (InterruptedException var4) { - throw new RuntimeException(var4); - } - - if (element == null) { - throw new TimeoutException(); - } else { - return (T) element; - } - } - - @Override - public SinkWriter createWriter(InitContext context) { - return new CollectingWriter<>(index); - } - - private static class CollectingWriter implements SinkWriter { - private final int index; - - CollectingWriter(int index) { - this.index = index; - } - - @Override - public void write(T element, Context context) { - QUEUES.get(index).add(element); - } - - @Override - public void flush(boolean endOfInput) { - // Nothing to do here - } - - @Override - public void close() { - // Nothing to do here - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java deleted file mode 100644 index eff32fcfa118..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.operator; - -import java.util.ArrayDeque; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.CompletableFuture; -import org.apache.flink.api.common.eventtime.Watermark; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.api.connector.source.ReaderOutput; -import org.apache.flink.api.connector.source.Source; -import org.apache.flink.api.connector.source.SourceReader; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.api.connector.source.SourceSplit; -import org.apache.flink.api.connector.source.SplitEnumerator; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.flink.api.java.typeutils.ResultTypeQueryable; -import org.apache.flink.core.io.InputStatus; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Queues; -import org.jetbrains.annotations.Nullable; - -/** Testing source implementation for Flink sources which can be triggered manually. */ -public class ManualSource - implements Source, - ResultTypeQueryable { - - private static final long serialVersionUID = 1L; - private static final List>> QUEUES = - Collections.synchronizedList(Lists.newArrayList()); - private static final List> AVAILABILITIES = - Collections.synchronizedList(Lists.newArrayList()); - private static int numSources = 0; - private final TypeInformation type; - private final int index; - private transient DataStream stream; - private final transient StreamExecutionEnvironment env; - - /** - * Creates a new source for testing. - * - * @param env to register the source - * @param type of the events returned by the source - */ - public ManualSource(StreamExecutionEnvironment env, TypeInformation type) { - this.type = type; - this.env = env; - this.index = numSources++; - QUEUES.add(Queues.newArrayDeque()); - AVAILABILITIES.add(new CompletableFuture<>()); - } - - /** - * Emit a new record from the source. - * - * @param event to emit - */ - public void sendRecord(T event) { - this.sendInternal(Tuple2.of(event, null)); - } - - /** - * Emit a new record with the given event time from the source. - * - * @param event to emit - * @param eventTime of the event - */ - public void sendRecord(T event, long eventTime) { - this.sendInternal(Tuple2.of(event, eventTime)); - } - - /** - * Emit a watermark from the source. - * - * @param timeStamp of the watermark - */ - public void sendWatermark(long timeStamp) { - this.sendInternal(Tuple2.of(null, timeStamp)); - } - - /** Mark the source as finished. */ - void markFinished() { - this.sendWatermark(Long.MAX_VALUE); - this.sendInternal(Tuple2.of(null, null)); - } - - /** - * Get the {@link DataStream} for this source. 
- * - * @return the stream emitted by this source - */ - public DataStream dataStream() { - if (this.stream == null) { - this.stream = - this.env - .fromSource(this, WatermarkStrategy.noWatermarks(), "ManualSource-" + index, type) - .forceNonParallel(); - } - - return this.stream; - } - - private void sendInternal(Tuple2 tuple) { - QUEUES.get(index).offer(tuple); - AVAILABILITIES.get(index).complete(null); - } - - @Override - public Boundedness getBoundedness() { - return Boundedness.CONTINUOUS_UNBOUNDED; - } - - @Override - public SplitEnumerator createEnumerator( - SplitEnumeratorContext enumContext) { - return new DummyCheckpointEnumerator(); - } - - @Override - public SplitEnumerator restoreEnumerator( - SplitEnumeratorContext enumContext, DummyCheckpoint checkpoint) { - return new DummyCheckpointEnumerator(); - } - - @Override - public SimpleVersionedSerializer getSplitSerializer() { - return new NoOpDummySplitSerializer(); - } - - @Override - public SimpleVersionedSerializer getEnumeratorCheckpointSerializer() { - return new NoOpDummyCheckpointSerializer(); - } - - @Override - public SourceReader createReader(SourceReaderContext sourceReaderContext) { - return new SourceReader<>() { - @Override - public void start() { - // Do nothing - } - - @SuppressWarnings("unchecked") - @Override - public InputStatus pollNext(ReaderOutput output) { - Tuple2 next = (Tuple2) QUEUES.get(index).poll(); - - if (next != null) { - if (next.f0 == null) { - if (next.f1 == null) { - // No more input - return InputStatus.END_OF_INPUT; - } else { - output.emitWatermark(new Watermark(next.f1)); - } - } else if (next.f1 == null) { - // No event time set - output.collect(next.f0); - } else { - // With event time - output.collect(next.f0, next.f1); - } - } - - AVAILABILITIES.set(index, new CompletableFuture<>()); - return QUEUES.get(index).isEmpty() - ? InputStatus.NOTHING_AVAILABLE - : InputStatus.MORE_AVAILABLE; - } - - @Override - public List snapshotState(long checkpointId) { - return Lists.newArrayList(new DummySplit()); - } - - @Override - public CompletableFuture isAvailable() { - return AVAILABILITIES.get(index); - } - - @Override - public void addSplits(List splits) { - // do nothing - } - - @Override - public void notifyNoMoreSplits() { - // do nothing - } - - @Override - public void close() { - // do nothing - } - }; - } - - @Override - public TypeInformation getProducedType() { - return this.type; - } - - /** - * Placeholder because the ManualSource itself implicitly represents the only split and does not - * require an actual split object. - */ - public static class DummySplit implements SourceSplit { - @Override - public String splitId() { - return "dummy"; - } - } - - /** - * Placeholder because the ManualSource does not support fault-tolerance and thus does not require - * actual checkpointing. - */ - public static class DummyCheckpoint {} - - /** Placeholder because the ManualSource does not need enumeration, but checkpointing needs it. 
*/ - private static class DummyCheckpointEnumerator - implements SplitEnumerator { - - @Override - public void start() { - // do nothing - } - - @Override - public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { - // do nothing - } - - @Override - public void addSplitsBack(List splits, int subtaskId) { - // do nothing - } - - @Override - public void addReader(int subtaskId) { - // do nothing - } - - @Override - public DummyCheckpoint snapshotState(long checkpointId) { - return new DummyCheckpoint(); - } - - @Override - public void close() { - // do nothing - } - } - - /** - * Not used - only required to avoid NullPointerException. The split is not transferred from the - * enumerator, it is implicitly represented by the ManualSource. - */ - private static class NoOpDummySplitSerializer implements SimpleVersionedSerializer { - @Override - public int getVersion() { - return 0; - } - - @Override - public byte[] serialize(DummySplit split) { - return new byte[0]; - } - - @Override - public DummySplit deserialize(int version, byte[] serialized) { - return new DummySplit(); - } - } - - /** - * Not used - only required to avoid NullPointerException. The split is not transferred from the - * enumerator, it is implicitly represented by the ManualSource. - */ - private static class NoOpDummyCheckpointSerializer - implements SimpleVersionedSerializer { - @Override - public int getVersion() { - return 0; - } - - @Override - public byte[] serialize(DummyCheckpoint split) { - return new byte[0]; - } - - @Override - public DummyCheckpoint deserialize(int version, byte[] serialized) { - return new DummyCheckpoint(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java deleted file mode 100644 index ed66ff3df076..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/MetricsReporterFactoryForTests.java +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.Gauge; -import org.apache.flink.metrics.Metric; -import org.apache.flink.metrics.MetricConfig; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.metrics.reporter.MetricReporter; -import org.apache.flink.metrics.reporter.MetricReporterFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -public class MetricsReporterFactoryForTests implements MetricReporterFactory { - private static final TestMetricsReporter INSTANCE = new TestMetricsReporter(); - private static final Pattern TASK_METRIC_NAME = - Pattern.compile( - "\\.taskmanager\\.[^.]+\\.[^.]+\\.([^.]+)\\.\\d+\\." - + TableMaintenanceMetrics.GROUP_KEY - + "\\." - + TableMaintenanceMetrics.TABLE_NAME_KEY - + "\\.([^.]+)\\." - + TableMaintenanceMetrics.TASK_NAME_KEY - + "\\.([^.]+)\\." - + TableMaintenanceMetrics.TASK_INDEX_KEY - + "\\.([^.]+)\\.([^.]+)"); - - private static final Pattern MAIN_METRIC_NAME = - Pattern.compile( - "\\.taskmanager\\.[^.]+\\.[^.]+\\.([^.]+)\\.\\d+\\." - + TableMaintenanceMetrics.GROUP_KEY - + "\\." - + TableMaintenanceMetrics.TABLE_NAME_KEY - + "\\.([^.]+)\\.([^.]+)"); - - private static Map counters = Maps.newConcurrentMap(); - private static Map gauges = Maps.newConcurrentMap(); - private static Set monitoredMetricNames; - - public MetricsReporterFactoryForTests() { - monitoredMetricNames = - Arrays.stream(TableMaintenanceMetrics.class.getDeclaredFields()) - .map( - f -> { - try { - return f.get(null).toString(); - } catch (IllegalAccessException e) { - throw new RuntimeException(e); - } - }) - .collect(Collectors.toSet()); - } - - @Override - public MetricReporter createMetricReporter(Properties properties) { - return INSTANCE; - } - - public static void reset() { - counters = Maps.newConcurrentMap(); - gauges = Maps.newConcurrentMap(); - } - - public static Long counter(List parts) { - return counterValues().get(longName(parts)); - } - - public static Long gauge(List parts) { - return gaugeValues().get(longName(parts)); - } - - public static void assertGauges(Map, Long> expected) { - Map transformed = - expected.entrySet().stream() - .collect(Collectors.toMap(k -> longName(k.getKey()), Map.Entry::getValue)); - assertThat(filter(gaugeValues(), transformed)).isEqualTo(filter(transformed, transformed)); - } - - public static void assertCounters(Map, Long> expected) { - Map transformed = - expected.entrySet().stream() - .collect(Collectors.toMap(k -> longName(k.getKey()), Map.Entry::getValue)); - assertThat(filter(counterValues(), transformed)).isEqualTo(filter(transformed, transformed)); - } - - private static Map gaugeValues() { - return gauges.entrySet().stream() - .collect( - Collectors.toMap( - entry -> longName(entry.getKey()), entry -> (Long) entry.getValue().getValue())); - } - - private static Map counterValues() { - return counters.entrySet().stream() - .collect( - Collectors.toMap( - entry -> longName(entry.getKey()), entry -> entry.getValue().getCount())); - } - - private static Map filter(Map original, Map filter) { - return original.entrySet().stream() - .filter( - entry -> { - Long filterValue = filter.get(entry.getKey()); - 
return filterValue == null || filterValue != -1; - }) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); - } - - private static String longName(String fullName) { - Matcher mainMatcher = MAIN_METRIC_NAME.matcher(fullName); - Matcher taskMatcher = TASK_METRIC_NAME.matcher(fullName); - - if (taskMatcher.matches()) { - return taskMatcher.group(1) - + "." - + taskMatcher.group(2) - + "." - + taskMatcher.group(3) - + "." - + taskMatcher.group(4) - + "." - + taskMatcher.group(5); - } - - if (mainMatcher.matches()) { - return mainMatcher.group(1) + "." + mainMatcher.group(2) + "." + mainMatcher.group(3); - } - - throw new RuntimeException(String.format("Can't parse simplified metrics name %s", fullName)); - } - - private static String longName(List parts) { - return parts.stream().map(s -> s.replaceAll("\\.", "_")).collect(Collectors.joining(".")); - } - - private static class TestMetricsReporter implements MetricReporter { - @Override - public void open(MetricConfig config) { - // do nothing - } - - @Override - public void close() { - // do nothing - } - - @Override - public void notifyOfAddedMetric(Metric metric, String metricName, MetricGroup group) { - if (monitoredMetricNames.contains(metricName)) { - if (metric instanceof Counter) { - counters.put(group.getMetricIdentifier(metricName), (Counter) metric); - } - - if (metric instanceof Gauge) { - gauges.put(group.getMetricIdentifier(metricName), (Gauge) metric); - } - } - } - - @Override - public void notifyOfRemovedMetric(Metric metric, String metricName, MetricGroup group) { - // do nothing - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java deleted file mode 100644 index 8460b392e278..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java +++ /dev/null @@ -1,400 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.MetricOptions; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.core.execution.SavepointFormatType; -import org.apache.flink.runtime.jobgraph.SavepointConfigOptions; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.graph.StreamGraphGenerator; -import org.apache.flink.streaming.api.transformations.SinkTransformation; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Files; -import org.apache.iceberg.PartitionData; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.data.FileHelpers; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.deletes.PositionDelete; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -public class OperatorTestBase { - private static final int NUMBER_TASK_MANAGERS = 1; - private static final int SLOTS_PER_TASK_MANAGER = 8; - private static final Schema SCHEMA_WITH_PRIMARY_KEY = - new Schema( - Lists.newArrayList( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())), - ImmutableMap.of(), - ImmutableSet.of(SimpleDataUtil.SCHEMA.columns().get(0).fieldId())); - - protected static final String UID_SUFFIX = "UID-Dummy"; - protected static final String SLOT_SHARING_GROUP = "SlotSharingGroup"; - protected static final TriggerLockFactory LOCK_FACTORY = new MemoryLockFactory(); - - public static final String IGNORED_OPERATOR_NAME = "Ignore"; - - static final long EVENT_TIME = 10L; - static final long EVENT_TIME_2 = 11L; - static final Watermark WATERMARK = new Watermark(EVENT_TIME); - protected static final String DUMMY_TASK_NAME = "dummyTask"; - protected static final String DUMMY_TABLE_NAME = "dummyTable"; - - static final String FILE_NAME_1 = "fileName1"; - static final 
String FILE_NAME_2 = "fileName2"; - static final Watermark WATERMARK_2 = new Watermark(EVENT_TIME_2); - - @RegisterExtension - protected static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(NUMBER_TASK_MANAGERS) - .setNumberSlotsPerTaskManager(SLOTS_PER_TASK_MANAGER) - .setConfiguration(config()) - .build()); - - @TempDir private Path warehouseDir; - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); - - @BeforeEach - void before() { - LOCK_FACTORY.open(); - LOCK_FACTORY.createLock().unlock(); - LOCK_FACTORY.createRecoveryLock().unlock(); - MetricsReporterFactoryForTests.reset(); - } - - @AfterEach - void after() throws IOException { - LOCK_FACTORY.close(); - } - - protected static Table createTable() { - // only test V2 tables as compaction doesn't support V3 with row lineage - return createTable("2"); - } - - protected static Table createTable(String formatVersion) { - return CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - PartitionSpec.unpartitioned(), - null, - ImmutableMap.of( - TableProperties.FORMAT_VERSION, - formatVersion, - "flink.max-continuous-empty-commits", - "100000")); - } - - protected static Table createTableWithDelete() { - return CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SCHEMA_WITH_PRIMARY_KEY, - PartitionSpec.unpartitioned(), - null, - ImmutableMap.of("format-version", "2", "write.upsert.enabled", "true")); - } - - protected static Table createPartitionedTable() { - return CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build(), - null, - ImmutableMap.of("format-version", "2", "flink.max-continuous-empty-commits", "100000")); - } - - protected void insert(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) - .appendToTable(Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); - table.refresh(); - } - - protected void insert(Table table, Integer id, String data, String extra) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) - .appendToTable(Lists.newArrayList(SimpleDataUtil.createRecord(id, data, extra))); - table.refresh(); - } - - /** - * For the same identifier column id this methods simulate the following row operations:

- * - * <ul> - *   <li>add an equality delete on oldData - *   <li>insert newData - * </ul> - * - * @param table to modify - * @param id the identifier column id - * @param oldData the old data to be deleted - * @param newData the new data to be inserted - */ - protected void update(Table table, Integer id, String oldData, String newData) - throws IOException { - DataFile dataFile = - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) - .writeFile(Lists.newArrayList(SimpleDataUtil.createRecord(id, newData))); - DeleteFile eqDelete = writeEqualityDelete(table, id, oldData); - - table.newRowDelta().addRows(dataFile).addDeletes(eqDelete).commit(); - } - - /** - * For the same identifier column id this methods simulate the following row operations: - *
- * <ul> - *   <li>add an equality delete on oldData - *   <li>insert tempData - *   <li>add a position delete on tempData - *   <li>insert newData - * </ul>
- * - * @param table to modify - * @param id the identifier column id - * @param oldData the old data to be deleted - * @param tempData the temp data to be inserted and deleted with a position delete - * @param newData the new data to be inserted - */ - protected void update(Table table, Integer id, String oldData, String tempData, String newData) - throws IOException { - DataFile dataFile = - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) - .writeFile( - Lists.newArrayList( - SimpleDataUtil.createRecord(id, tempData), - SimpleDataUtil.createRecord(id, newData))); - DeleteFile eqDelete = writeEqualityDelete(table, id, oldData); - DeleteFile posDelete = writePosDelete(table, dataFile.path(), 0, id, tempData); - - table.newRowDelta().addRows(dataFile).addDeletes(eqDelete).addDeletes(posDelete).commit(); - } - - protected void insertPartitioned(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) - .appendToTable( - TestHelpers.Row.of(data), Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); - table.refresh(); - } - - protected void insertFullPartitioned(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) - .appendToTable( - TestHelpers.Row.of(data, id), - Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); - table.refresh(); - } - - protected void dropTable() { - CATALOG_EXTENSION.catalogLoader().loadCatalog().dropTable(TestFixtures.TABLE_IDENTIFIER); - } - - protected TableLoader tableLoader() { - return CATALOG_EXTENSION.tableLoader(); - } - - /** - * Close the {@link JobClient} and wait for the job closure. If the savepointDir is specified, it - * stops the job with a savepoint. - * - * @param jobClient the job to close - * @param savepointDir the savepointDir to store the last savepoint. If null then - * stop without a savepoint. - * @return configuration for restarting the job from the savepoint - */ - protected static Configuration closeJobClient(JobClient jobClient, File savepointDir) { - Configuration conf = new Configuration(); - if (jobClient != null) { - if (savepointDir != null) { - // Stop with savepoint - jobClient.stopWithSavepoint(false, savepointDir.getPath(), SavepointFormatType.CANONICAL); - // Wait until the savepoint is created and the job has been stopped - Awaitility.await().until(() -> savepointDir.listFiles(File::isDirectory).length == 1); - conf.set( - SavepointConfigOptions.SAVEPOINT_PATH, - savepointDir.listFiles(File::isDirectory)[0].getAbsolutePath()); - } else { - jobClient.cancel(); - } - - // Wait until the job has been stopped - Awaitility.await().until(() -> jobClient.getJobStatus().get().isTerminalState()); - return conf; - } - - return null; - } - - /** - * Close the {@link JobClient} and wait for the job closure.
- * - * @param jobClient the job to close - */ - protected static void closeJobClient(JobClient jobClient) { - closeJobClient(jobClient, null); - } - - protected static void checkUidsAreSet(StreamExecutionEnvironment env, String uidSuffix) { - env.getTransformations().stream() - .filter( - t -> !(t instanceof SinkTransformation) && !(t.getName().equals(IGNORED_OPERATOR_NAME))) - .forEach( - transformation -> { - assertThat(transformation.getUid()).isNotNull(); - if (uidSuffix != null) { - assertThat(transformation.getUid()).contains(UID_SUFFIX); - } - }); - } - - protected static void checkSlotSharingGroupsAreSet(StreamExecutionEnvironment env, String name) { - String nameToCheck = name != null ? name : StreamGraphGenerator.DEFAULT_SLOT_SHARING_GROUP; - - env.getTransformations().stream() - .filter( - t -> !(t instanceof SinkTransformation) && !(t.getName().equals(IGNORED_OPERATOR_NAME))) - .forEach( - t -> { - assertThat(t.getSlotSharingGroup()).isPresent(); - assertThat(t.getSlotSharingGroup().get().getName()).isEqualTo(nameToCheck); - }); - } - - private static Configuration config() { - Configuration config = new Configuration(DISABLE_CLASSLOADER_CHECK_CONFIG); - MetricOptions.forReporter(config, "test_reporter") - .set(MetricOptions.REPORTER_FACTORY_CLASS, MetricsReporterFactoryForTests.class.getName()); - return config; - } - - private DeleteFile writeEqualityDelete(Table table, Integer id, String oldData) - throws IOException { - File file = File.createTempFile("junit", null, warehouseDir.toFile()); - assertThat(file.delete()).isTrue(); - return FileHelpers.writeDeleteFile( - table, - Files.localOutput(file), - new PartitionData(PartitionSpec.unpartitioned().partitionType()), - Lists.newArrayList(SimpleDataUtil.createRecord(id, oldData)), - SCHEMA_WITH_PRIMARY_KEY); - } - - private DeleteFile writePosDelete( - Table table, CharSequence path, Integer pos, Integer id, String oldData) throws IOException { - File file = File.createTempFile("junit", null, warehouseDir.toFile()); - assertThat(file.delete()).isTrue(); - PositionDelete posDelete = PositionDelete.create(); - GenericRecord nested = GenericRecord.create(table.schema()); - nested.set(0, id); - nested.set(1, oldData); - posDelete.set(path, pos, nested); - return FileHelpers.writePosDeleteFile( - table, Files.localOutput(file), null, Lists.newArrayList(posDelete)); - } - - static void trigger(OneInputStreamOperatorTestHarness harness) throws Exception { - long time = System.currentTimeMillis(); - harness.processElement(Trigger.create(time, 0), time); - } - - private static class MemoryLock implements TriggerLockFactory.Lock { - volatile boolean locked = false; - - @Override - public boolean tryLock() { - if (locked) { - return false; - } else { - locked = true; - return true; - } - } - - @Override - public boolean isHeld() { - return locked; - } - - @Override - public void unlock() { - locked = false; - } - } - - private static class MemoryLockFactory implements TriggerLockFactory { - private static final TriggerLockFactory.Lock MAINTENANCE_LOCK = new MemoryLock(); - private static final TriggerLockFactory.Lock RECOVERY_LOCK = new MemoryLock(); - - @Override - public void open() { - // do nothing - } - - @Override - public Lock createLock() { - return MAINTENANCE_LOCK; - } - - @Override - public Lock createRecoveryLock() { - return RECOVERY_LOCK; - } - - @Override - public void close() { - // do nothing - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java deleted file mode 100644 index 68aaf29ac0d1..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.apache.iceberg.actions.SizeBasedFileRewritePlanner.MIN_INPUT_FILES; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.Set; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Table; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; - -class RewriteUtil { - private RewriteUtil() {} - - static List planDataFileRewrite(TableLoader tableLoader) - throws Exception { - try (OneInputStreamOperatorTestHarness - testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new DataFileRewritePlanner( - OperatorTestBase.DUMMY_TABLE_NAME, - OperatorTestBase.DUMMY_TABLE_NAME, - 0, - tableLoader, - 11, - 10_000_000L, - ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue()))) { - testHarness.open(); - - OperatorTestBase.trigger(testHarness); - - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - return testHarness.extractOutputValues(); - } - } - - static List executeRewrite( - List elements) throws Exception { - try (OneInputStreamOperatorTestHarness< - DataFileRewritePlanner.PlannedGroup, DataFileRewriteRunner.ExecutedGroup> - testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new DataFileRewriteRunner( - OperatorTestBase.DUMMY_TABLE_NAME, OperatorTestBase.DUMMY_TABLE_NAME, 0))) { - testHarness.open(); - - for (DataFileRewritePlanner.PlannedGroup element : elements) { - testHarness.processElement(element, System.currentTimeMillis()); - } - - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - return testHarness.extractOutputValues(); - } - } - - static Set newDataFiles(Table table) { - table.refresh(); - return Sets.newHashSet(table.currentSnapshot().addedDataFiles(table.io())); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java deleted file mode 100644 index 9e8f2ec92162..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteCommitter.java +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.executeRewrite; -import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; -import static org.apache.iceberg.metrics.CommitMetricsResult.TOTAL_DATA_FILES; -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.junit.jupiter.api.Test; - -class TestDataFileRewriteCommitter extends OperatorTestBase { - @Test - void testUnpartitioned() throws Exception { - Table table = createTable(); - insert(table, 1, "p1"); - insert(table, 2, "p2"); - insert(table, 3, "p3"); - - List planned = planDataFileRewrite(tableLoader()); - assertThat(planned).hasSize(1); - List rewritten = executeRewrite(planned); - assertThat(rewritten).hasSize(1); - - try (OneInputStreamOperatorTestHarness - testHarness = harness()) { - testHarness.open(); - - testHarness.processElement(rewritten.get(0), EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - - testHarness.processWatermark(EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - } - - assertDataFiles( - table, rewritten.get(0).group().addedFiles(), rewritten.get(0).group().rewrittenFiles(), 1); - } - - @Test - void testPartitioned() throws Exception { - Table table = createPartitionedTable(); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); - - List planned = planDataFileRewrite(tableLoader()); - assertThat(planned).hasSize(2); - List rewritten = executeRewrite(planned); - assertThat(rewritten).hasSize(2); - assertThat(rewritten.get(0).groupsPerCommit()).isEqualTo(1); - assertThat(rewritten.get(1).groupsPerCommit()).isEqualTo(1); - ensureDifferentGroups(rewritten); - - try (OneInputStreamOperatorTestHarness - testHarness = harness()) { - testHarness.open(); - - 
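For orientation, the plan → execute → commit flow these committer tests drive can be condensed as below. RewriteUtil, DataFileRewriteCommitter, DUMMY_TABLE_NAME, tableLoader() and EVENT_TIME all come from the removed test code; the rest is a sketch of how they fit together, not the module's documented API:

    // Plan: the planner runs as a ProcessFunction driven by a Trigger element.
    List<DataFileRewritePlanner.PlannedGroup> planned =
        RewriteUtil.planDataFileRewrite(tableLoader());

    // Execute: rewrite each planned group into new data files.
    List<DataFileRewriteRunner.ExecutedGroup> rewritten = RewriteUtil.executeRewrite(planned);

    // Commit: feed the executed groups into the committer operator under a test harness.
    try (var testHarness =
        new OneInputStreamOperatorTestHarness<>(
            new DataFileRewriteCommitter(
                OperatorTestBase.DUMMY_TABLE_NAME,
                OperatorTestBase.DUMMY_TABLE_NAME,
                0,
                tableLoader()))) {
      testHarness.open();
      for (DataFileRewriteRunner.ExecutedGroup group : rewritten) {
        testHarness.processElement(group, EVENT_TIME);
      }
      // Groups that were not committed eagerly are flushed on the watermark / on close.
      testHarness.processWatermark(EVENT_TIME);
    }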
testHarness.processElement(rewritten.get(0), EVENT_TIME); - assertDataFiles( - table, - rewritten.get(0).group().addedFiles(), - rewritten.get(0).group().rewrittenFiles(), - 3); - - testHarness.processElement(rewritten.get(1), EVENT_TIME); - assertDataFiles( - table, - rewritten.get(1).group().addedFiles(), - rewritten.get(1).group().rewrittenFiles(), - 2); - - assertThat(testHarness.extractOutputValues()).isEmpty(); - - testHarness.processWatermark(EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - } - } - - @Test - void testNewTable() throws Exception { - Table table = createTable(); - List rewritten; - - try (OneInputStreamOperatorTestHarness - testHarness = harness()) { - testHarness.open(); - - insert(table, 1, "p1"); - insert(table, 2, "p2"); - insert(table, 3, "p3"); - - List planned = planDataFileRewrite(tableLoader()); - assertThat(planned).hasSize(1); - rewritten = executeRewrite(planned); - assertThat(rewritten).hasSize(1); - - testHarness.processElement(rewritten.get(0), EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - - testHarness.processWatermark(EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - } - - assertDataFiles( - table, rewritten.get(0).group().addedFiles(), rewritten.get(0).group().rewrittenFiles(), 1); - } - - @Test - void testBatchSize() throws Exception { - Table table = createPartitionedTable(); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); - insertPartitioned(table, 5, "p3"); - insertPartitioned(table, 6, "p3"); - - List planned = planDataFileRewrite(tableLoader()); - assertThat(planned).hasSize(3); - List rewritten = executeRewrite(planned); - assertThat(rewritten).hasSize(3); - ensureDifferentGroups(rewritten); - - try (OneInputStreamOperatorTestHarness - testHarness = harness()) { - testHarness.open(); - - testHarness.processElement(setBatchSizeToTwo(rewritten.get(0)), EVENT_TIME); - assertNoChange(table); - testHarness.processElement(setBatchSizeToTwo(rewritten.get(1)), EVENT_TIME); - - Set added = Sets.newHashSet(rewritten.get(0).group().addedFiles()); - added.addAll(rewritten.get(1).group().addedFiles()); - Set removed = Sets.newHashSet(rewritten.get(0).group().rewrittenFiles()); - removed.addAll(rewritten.get(1).group().rewrittenFiles()); - assertDataFiles(table, added, removed, 4); - - testHarness.processElement(setBatchSizeToTwo(rewritten.get(2)), EVENT_TIME); - assertNoChange(table); - - assertThat(testHarness.extractOutputValues()).isEmpty(); - - testHarness.processWatermark(EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - } - - // This should be committed on close - assertDataFiles( - table, rewritten.get(2).group().addedFiles(), rewritten.get(2).group().rewrittenFiles(), 3); - } - - @Test - void testError() throws Exception { - Table table = createPartitionedTable(); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); - insertPartitioned(table, 5, "p3"); - insertPartitioned(table, 6, "p3"); - insertPartitioned(table, 7, "p4"); - insertPartitioned(table, 8, "p4"); - - List planned = planDataFileRewrite(tableLoader()); - assertThat(planned).hasSize(4); - List rewritten = executeRewrite(planned); - assertThat(rewritten).hasSize(4); - - try (OneInputStreamOperatorTestHarness - testHarness = harness()) { - testHarness.open(); - - 
testHarness.processElement(setBatchSizeToTwo(rewritten.get(0)), EVENT_TIME); - assertNoChange(table); - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - - DataFileRewriteRunner.ExecutedGroup group = spy(setBatchSizeToTwo(rewritten.get(1))); - when(group.group()).thenThrow(new RuntimeException("Testing error")); - testHarness.processElement(group, EVENT_TIME); - - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).hasSize(1); - assertThat( - testHarness - .getSideOutput(TaskResultAggregator.ERROR_STREAM) - .poll() - .getValue() - .getMessage()) - .contains("Testing error"); - } - } - - private OneInputStreamOperatorTestHarness harness() - throws Exception { - return new OneInputStreamOperatorTestHarness<>( - new DataFileRewriteCommitter( - OperatorTestBase.DUMMY_TABLE_NAME, - OperatorTestBase.DUMMY_TABLE_NAME, - 0, - tableLoader())); - } - - private static DataFileRewriteRunner.ExecutedGroup setBatchSizeToTwo( - DataFileRewriteRunner.ExecutedGroup from) { - return new DataFileRewriteRunner.ExecutedGroup(from.snapshotId(), 2, from.group()); - } - - // Ensure that the groups are different, so the tests are not accidentally passing - private static void ensureDifferentGroups(List rewritten) { - List resultFiles = - rewritten.stream() - .flatMap(task -> task.group().addedFiles().stream().map(ContentFile::location)) - .collect(Collectors.toList()); - assertThat(resultFiles).hasSize(Set.copyOf(resultFiles).size()); - } - - /** - * Assert that the number of the data files in the table is as expected. Additionally, tests that - * the last commit contains the expected added and removed files. - * - * @param table the table to check - * @param expectedAdded the expected added data files - * @param expectedRemoved the expected removed data files - * @param expectedCurrent the expected current data files count - */ - private static void assertDataFiles( - Table table, - Set expectedAdded, - Set expectedRemoved, - long expectedCurrent) { - table.refresh(); - - assertThat(table.currentSnapshot().summary().get(TOTAL_DATA_FILES)) - .isEqualTo(String.valueOf(expectedCurrent)); - Set actualAdded = Sets.newHashSet(table.currentSnapshot().addedDataFiles(table.io())); - Set actualRemoved = - Sets.newHashSet(table.currentSnapshot().removedDataFiles(table.io())); - assertThat(actualAdded.stream().map(DataFile::location).collect(Collectors.toSet())) - .isEqualTo(expectedAdded.stream().map(DataFile::location).collect(Collectors.toSet())); - assertThat(actualRemoved.stream().map(DataFile::location).collect(Collectors.toSet())) - .isEqualTo(expectedRemoved.stream().map(DataFile::location).collect(Collectors.toSet())); - } - - private static void assertNoChange(Table table) { - long original = table.currentSnapshot().snapshotId(); - table.refresh(); - - assertThat(table.currentSnapshot().snapshotId()).isEqualTo(original); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java deleted file mode 100644 index 2d83f553e576..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.apache.iceberg.actions.SizeBasedFileRewritePlanner.MIN_INPUT_FILES; -import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.newDataFiles; -import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileContent; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Table; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.junit.jupiter.api.Test; - -class TestDataFileRewritePlanner extends OperatorTestBase { - @Test - void testFailsOnV3Table() throws Exception { - Table table = createTable("3"); - Set expected = Sets.newHashSetWithExpectedSize(3); - insert(table, 1, "a"); - expected.addAll(newDataFiles(table)); - - assertThatThrownBy(() -> planDataFileRewrite(tableLoader())) - .hasMessageContaining( - "Flink does not support compaction on row lineage enabled tables (V3+)") - .isInstanceOf(IllegalArgumentException.class); - } - - @Test - void testUnpartitioned() throws Exception { - Set expected = Sets.newHashSetWithExpectedSize(3); - Table table = createTable(); - insert(table, 1, "a"); - expected.addAll(newDataFiles(table)); - insert(table, 2, "b"); - expected.addAll(newDataFiles(table)); - insert(table, 3, "c"); - expected.addAll(newDataFiles(table)); - - List actual = planDataFileRewrite(tableLoader()); - - assertThat(actual).hasSize(1); - assertRewriteFileGroup(actual.get(0), table, expected); - } - - @Test - void testPartitioned() throws Exception { - Set expectedP1 = Sets.newHashSetWithExpectedSize(2); - Set expectedP2 = Sets.newHashSetWithExpectedSize(2); - Table table = createPartitionedTable(); - insertPartitioned(table, 1, "p1"); - expectedP1.addAll(newDataFiles(table)); - insertPartitioned(table, 2, "p1"); - expectedP1.addAll(newDataFiles(table)); - - insertPartitioned(table, 3, "p2"); - expectedP2.addAll(newDataFiles(table)); - insertPartitioned(table, 4, "p2"); - expectedP2.addAll(newDataFiles(table)); - - // This should not participate in compaction, as there is no more files in the partition - insertPartitioned(table, 5, "p3"); - - List actual = 
planDataFileRewrite(tableLoader()); - - assertThat(actual).hasSize(2); - if (actual.get(0).group().info().partition().get(0, String.class).equals("p1")) { - assertRewriteFileGroup(actual.get(0), table, expectedP1); - assertRewriteFileGroup(actual.get(1), table, expectedP2); - } else { - assertRewriteFileGroup(actual.get(0), table, expectedP2); - assertRewriteFileGroup(actual.get(1), table, expectedP1); - } - } - - @Test - void testError() throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - - try (OneInputStreamOperatorTestHarness - testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new DataFileRewritePlanner( - OperatorTestBase.DUMMY_TABLE_NAME, - OperatorTestBase.DUMMY_TABLE_NAME, - 0, - tableLoader(), - 11, - 1L, - ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue()))) { - testHarness.open(); - - // Cause an exception - dropTable(); - - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - trigger(testHarness); - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).hasSize(1); - assertThat( - testHarness - .getSideOutput(TaskResultAggregator.ERROR_STREAM) - .poll() - .getValue() - .getMessage()) - .contains("Table does not exist: "); - } - } - - @Test - void testV2Table() throws Exception { - Table table = createTableWithDelete(); - update(table, 1, null, "a", "b"); - update(table, 1, "b", "c"); - - List actual = planDataFileRewrite(tableLoader()); - - assertThat(actual).hasSize(1); - List tasks = actual.get(0).group().fileScanTasks(); - assertThat(tasks).hasSize(2); - // Find the task with the deletes - FileScanTask withDelete = tasks.get(0).deletes().isEmpty() ? tasks.get(1) : tasks.get(0); - assertThat(withDelete.deletes()).hasSize(2); - assertThat(withDelete.deletes().stream().map(ContentFile::content).collect(Collectors.toList())) - .containsExactlyInAnyOrder(FileContent.POSITION_DELETES, FileContent.EQUALITY_DELETES); - } - - @Test - void testMaxRewriteBytes() throws Exception { - Table table = createPartitionedTable(); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); - - // First run with high maxRewriteBytes - List planWithNoMaxRewriteBytes = - planDataFileRewrite(tableLoader()); - assertThat(planWithNoMaxRewriteBytes).hasSize(2); - - // Second run with low maxRewriteBytes, the 2nd group should be removed from the plan - long maxRewriteBytes = - planWithNoMaxRewriteBytes.get(0).group().fileScanTasks().get(0).sizeBytes() - + planWithNoMaxRewriteBytes.get(1).group().fileScanTasks().get(0).sizeBytes() - + 1; - try (OneInputStreamOperatorTestHarness - testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new DataFileRewritePlanner( - OperatorTestBase.DUMMY_TABLE_NAME, - OperatorTestBase.DUMMY_TABLE_NAME, - 0, - tableLoader(), - 11, - maxRewriteBytes, - ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue()))) { - testHarness.open(); - - OperatorTestBase.trigger(testHarness); - - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - // Only a single group is planned - assertThat(testHarness.extractOutputValues()).hasSize(1); - } - } - - void assertRewriteFileGroup( - DataFileRewritePlanner.PlannedGroup plannedGroup, Table table, Set files) { - assertThat(plannedGroup.table().currentSnapshot().snapshotId()) - .isEqualTo(table.currentSnapshot().snapshotId()); - 
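The byte-capped planning exercised above (testMaxRewriteBytes) reduces to the sketch below. The constructor arguments are copied from the removed test, so treat the positional values (the 0, the 11, the byte budget) as that test's configuration rather than documented defaults; the maxRewriteBytes value here is purely illustrative:

    long maxRewriteBytes = 10_000_000L; // byte budget per planning run; smaller values drop whole groups
    try (var testHarness =
        ProcessFunctionTestHarnesses.forProcessFunction(
            new DataFileRewritePlanner(
                OperatorTestBase.DUMMY_TABLE_NAME,
                OperatorTestBase.DUMMY_TABLE_NAME,
                0,
                tableLoader(),
                11,
                maxRewriteBytes,
                ImmutableMap.of(MIN_INPUT_FILES, "2"),
                Expressions.alwaysTrue()))) {
      testHarness.open();
      OperatorTestBase.trigger(testHarness);

      // Nothing on the error side output means planning itself succeeded;
      // how many PlannedGroups come out depends on how many fit the byte budget.
      assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull();
      List<DataFileRewritePlanner.PlannedGroup> planned = testHarness.extractOutputValues();
      assertThat(planned).isNotNull();
    }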
assertThat(plannedGroup.groupsPerCommit()).isEqualTo(1); - assertThat( - plannedGroup.group().fileScanTasks().stream() - .map(s -> s.file().location()) - .collect(Collectors.toSet())) - .containsExactlyInAnyOrderElementsOf( - files.stream().map(ContentFile::location).collect(Collectors.toList())); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java deleted file mode 100644 index 3c5a10328756..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java +++ /dev/null @@ -1,357 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.apache.iceberg.actions.RewriteDataFiles.TARGET_FILE_SIZE_BYTES; -import static org.apache.iceberg.actions.SizeBasedFileRewritePlanner.MIN_INPUT_FILES; -import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.executeRewrite; -import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.Collection; -import java.util.List; -import java.util.Set; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileContent; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionData; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.parquet.GenericParquetReaders; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.junit.jupiter.api.Test; -import 
org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -class TestDataFileRewriteRunner extends OperatorTestBase { - @ParameterizedTest - @ValueSource(booleans = {true, false}) - void testExecute(boolean partitioned) throws Exception { - Table table; - PartitionData partition; - if (partitioned) { - table = createPartitionedTable(); - partition = new PartitionData(table.spec().partitionType()); - partition.set(0, "p1"); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p1"); - } else { - table = createTable(); - partition = new PartitionData(PartitionSpec.unpartitioned().partitionType()); - insert(table, 1, "p1"); - insert(table, 2, "p1"); - insert(table, 3, "p1"); - } - - List planned = planDataFileRewrite(tableLoader()); - assertThat(planned).hasSize(1); - List actual = executeRewrite(planned); - assertThat(actual).hasSize(1); - - assertRewriteFileGroup( - actual.get(0), - table, - records( - table.schema(), - ImmutableSet.of( - ImmutableList.of(1, "p1"), ImmutableList.of(2, "p1"), ImmutableList.of(3, "p1"))), - 1, - ImmutableSet.of(partition)); - } - - @Test - void testPartitionSpecChange() throws Exception { - Table table = createPartitionedTable(); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - PartitionData oldPartition = new PartitionData(table.spec().partitionType()); - oldPartition.set(0, "p1"); - - try (OneInputStreamOperatorTestHarness< - DataFileRewritePlanner.PlannedGroup, DataFileRewriteRunner.ExecutedGroup> - testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new DataFileRewriteRunner( - OperatorTestBase.DUMMY_TABLE_NAME, OperatorTestBase.DUMMY_TABLE_NAME, 0))) { - testHarness.open(); - - List planned = planDataFileRewrite(tableLoader()); - assertThat(planned).hasSize(1); - - testHarness.processElement(planned.get(0), System.currentTimeMillis()); - List actual = testHarness.extractOutputValues(); - assertThat(actual).hasSize(1); - assertRewriteFileGroup( - actual.get(0), - table, - records( - table.schema(), - ImmutableSet.of(ImmutableList.of(1, "p1"), ImmutableList.of(2, "p1"))), - 1, - ImmutableSet.of(oldPartition)); - - insertPartitioned(table, 3, "p1"); - - planned = planDataFileRewrite(tableLoader()); - assertThat(planned).hasSize(1); - - testHarness.processElement(planned.get(0), System.currentTimeMillis()); - actual = testHarness.extractOutputValues(); - assertThat(actual).hasSize(2); - assertRewriteFileGroup( - actual.get(1), - table, - records( - table.schema(), - ImmutableSet.of( - ImmutableList.of(1, "p1"), ImmutableList.of(2, "p1"), ImmutableList.of(3, "p1"))), - 1, - ImmutableSet.of(oldPartition)); - - // Alter the table schema - table.updateSpec().addField("id").commit(); - // Insert some now data - insertFullPartitioned(table, 4, "p1"); - insertFullPartitioned(table, 4, "p1"); - PartitionData newPartition = new PartitionData(table.spec().partitionType()); - newPartition.set(0, "p1"); - newPartition.set(1, 4); - table.refresh(); - - planned = planDataFileRewrite(tableLoader()); - assertThat(planned).hasSize(2); - DataFileRewritePlanner.PlannedGroup oldCompact = planned.get(0); - DataFileRewritePlanner.PlannedGroup newCompact = planned.get(1); - if (oldCompact.group().inputFileNum() == 2) { - newCompact = planned.get(0); - oldCompact = planned.get(1); - } - - testHarness.processElement(newCompact, System.currentTimeMillis()); - actual = testHarness.extractOutputValues(); - assertThat(actual).hasSize(3); - 
assertRewriteFileGroup( - actual.get(2), - table, - records( - table.schema(), - ImmutableList.of(ImmutableList.of(4, "p1"), ImmutableList.of(4, "p1"))), - 1, - ImmutableSet.of(newPartition)); - - testHarness.processElement(oldCompact, System.currentTimeMillis()); - actual = testHarness.extractOutputValues(); - assertThat(actual).hasSize(4); - PartitionData[] transformedPartitions = { - newPartition.copy(), newPartition.copy(), newPartition.copy() - }; - transformedPartitions[0].set(1, 1); - transformedPartitions[1].set(1, 2); - transformedPartitions[2].set(1, 3); - assertRewriteFileGroup( - actual.get(3), - table, - records( - table.schema(), - ImmutableSet.of( - ImmutableList.of(1, "p1"), ImmutableList.of(2, "p1"), ImmutableList.of(3, "p1"))), - 3, - Sets.newHashSet(transformedPartitions)); - } - } - - @Test - void testError() throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - - try (OneInputStreamOperatorTestHarness< - DataFileRewritePlanner.PlannedGroup, DataFileRewriteRunner.ExecutedGroup> - testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new DataFileRewriteRunner( - OperatorTestBase.DUMMY_TABLE_NAME, OperatorTestBase.DUMMY_TABLE_NAME, 0))) { - testHarness.open(); - - List planned = planDataFileRewrite(tableLoader()); - assertThat(planned).hasSize(1); - // Cause an exception - dropTable(); - - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - testHarness.processElement(planned.get(0), System.currentTimeMillis()); - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).hasSize(1); - assertThat( - testHarness - .getSideOutput(TaskResultAggregator.ERROR_STREAM) - .poll() - .getValue() - .getMessage()) - .contains("File does not exist: "); - } - } - - @Test - void testV2Table() throws Exception { - Table table = createTableWithDelete(); - update(table, 1, null, "a", "b"); - update(table, 1, "b", "c"); - - List planned = planDataFileRewrite(tableLoader()); - assertThat(planned).hasSize(1); - - List actual = executeRewrite(planned); - assertThat(actual).hasSize(1); - - assertRewriteFileGroup( - actual.get(0), - table, - records(table.schema(), ImmutableSet.of(ImmutableList.of(1, "c"))), - 1, - ImmutableSet.of(new PartitionData(PartitionSpec.unpartitioned().partitionType()))); - } - - @Test - void testSplitSize() throws Exception { - Table table = createTable(); - - File dataDir = new File(new Path(table.location(), "data").toUri().getPath()); - dataDir.mkdir(); - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(table, FileFormat.PARQUET, dataDir.toPath()); - List expected = Lists.newArrayListWithExpectedSize(4000); - for (int i = 0; i < 4; ++i) { - List batch = RandomGenericData.generate(table.schema(), 1000, 10 + i); - dataAppender.appendToTable(batch); - expected.addAll(batch); - } - - // First run with high target file size - List planWithNoTargetFileSize = - planDataFileRewrite(tableLoader()); - assertThat(planWithNoTargetFileSize).hasSize(1); - - // Second run with low target file size - long targetFileSize = - planWithNoTargetFileSize.get(0).group().fileScanTasks().get(0).sizeBytes() - + planWithNoTargetFileSize.get(0).group().fileScanTasks().get(1).sizeBytes(); - List planned; - try (OneInputStreamOperatorTestHarness - testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new DataFileRewritePlanner( - OperatorTestBase.DUMMY_TABLE_NAME, - OperatorTestBase.DUMMY_TABLE_NAME, - 0, - tableLoader(), - 11, - 10_000_000, - ImmutableMap.of( - 
MIN_INPUT_FILES, - "2", - TARGET_FILE_SIZE_BYTES, - String.valueOf(targetFileSize)), - Expressions.alwaysTrue()))) { - testHarness.open(); - - OperatorTestBase.trigger(testHarness); - - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - planned = testHarness.extractOutputValues(); - assertThat(planned).hasSize(1); - } - - List actual = executeRewrite(planned); - assertThat(actual).hasSize(1); - - assertRewriteFileGroup( - actual.get(0), - table, - expected, - 2, - ImmutableSet.of(new PartitionData(PartitionSpec.unpartitioned().partitionType()))); - } - - void assertRewriteFileGroup( - DataFileRewriteRunner.ExecutedGroup actual, - Table table, - Collection expectedRecords, - int expectedFileNum, - Set expectedPartitions) - throws IOException { - assertThat(actual.snapshotId()).isEqualTo(table.currentSnapshot().snapshotId()); - assertThat(actual.groupsPerCommit()).isEqualTo(1); - assertThat(actual.group().addedFiles()).hasSize(expectedFileNum); - Collection writtenRecords = Lists.newArrayListWithExpectedSize(expectedRecords.size()); - Set writtenPartitions = Sets.newHashSetWithExpectedSize(expectedPartitions.size()); - for (DataFile newDataFile : actual.group().addedFiles()) { - assertThat(newDataFile.format()).isEqualTo(FileFormat.PARQUET); - assertThat(newDataFile.content()).isEqualTo(FileContent.DATA); - assertThat(newDataFile.keyMetadata()).isNull(); - writtenPartitions.add(newDataFile.partition()); - - try (CloseableIterable reader = - Parquet.read(table.io().newInputFile(newDataFile.location())) - .project(table.schema()) - .createReaderFunc( - fileSchema -> GenericParquetReaders.buildReader(table.schema(), fileSchema)) - .build()) { - List newRecords = Lists.newArrayList(reader); - assertThat(newRecords).hasSize((int) newDataFile.recordCount()); - writtenRecords.addAll(newRecords); - } - } - - assertThat(writtenRecords).containsExactlyInAnyOrderElementsOf(expectedRecords); - assertThat(writtenPartitions).isEqualTo(expectedPartitions); - } - - private List records(Schema schema, Collection> data) { - GenericRecord record = GenericRecord.create(schema); - - ImmutableList.Builder builder = ImmutableList.builder(); - data.forEach( - recordData -> - builder.add( - record.copy(ImmutableMap.of("id", recordData.get(0), "data", recordData.get(1))))); - - return builder.build(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java deleted file mode 100644 index 7511e1029b6f..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDeleteFilesProcessor.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
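The read-back verification in the assertion helper above is the part most worth keeping in mind when writing similar tests; condensed, and assuming a Table table plus a freshly added DataFile newDataFile taken from the rewritten group:

    try (CloseableIterable<Record> reader =
        Parquet.read(table.io().newInputFile(newDataFile.location()))
            .project(table.schema())
            .createReaderFunc(
                fileSchema -> GenericParquetReaders.buildReader(table.schema(), fileSchema))
            .build()) {
      // Every rewritten file should be readable with the current table schema
      // and contain exactly as many rows as its metadata claims.
      List<Record> newRecords = Lists.newArrayList(reader);
      assertThat(newRecords).hasSize((int) newDataFile.recordCount());
    }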
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.FileSystems; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.flink.api.common.typeutils.base.StringSerializer; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -class TestDeleteFilesProcessor extends OperatorTestBase { - private static final String DUMMY_FILE_NAME = "dummy"; - private static final Set TABLE_FILES = - ImmutableSet.of( - "metadata/v1.metadata.json", - "metadata/version-hint.text", - "metadata/.version-hint.text.crc", - "metadata/.v1.metadata.json.crc"); - - private Table table; - - @BeforeEach - void before() { - this.table = createTable(); - } - - @Test - void testDelete() throws Exception { - // Write an extra file - Path dummyFile = Path.of(tablePath(table).toString(), DUMMY_FILE_NAME); - Files.write(dummyFile, "DUMMY".getBytes(StandardCharsets.UTF_8)); - - Set files = listFiles(table); - assertThat(files) - .containsAll(TABLE_FILES) - .contains(DUMMY_FILE_NAME) - .hasSize(TABLE_FILES.size() + 1); - - deleteFile(tableLoader(), dummyFile.toString()); - - assertThat(listFiles(table)).isEqualTo(TABLE_FILES); - } - - @Test - void testDeleteMissingFile() throws Exception { - Path dummyFile = - FileSystems.getDefault().getPath(table.location().substring(5), DUMMY_FILE_NAME); - - deleteFile(tableLoader(), dummyFile.toString()); - - assertThat(listFiles(table)).isEqualTo(TABLE_FILES); - } - - @Test - void testInvalidURIScheme() throws Exception { - deleteFile(tableLoader(), "wrong://"); - - assertThat(listFiles(table)).isEqualTo(TABLE_FILES); - } - - private void deleteFile(TableLoader tableLoader, String fileName) throws Exception { - tableLoader().open(); - try (OneInputStreamOperatorTestHarness testHarness = - new OneInputStreamOperatorTestHarness<>( - new DeleteFilesProcessor(table, DUMMY_TASK_NAME, 0, 10), StringSerializer.INSTANCE)) { - testHarness.open(); - testHarness.processElement(fileName, System.currentTimeMillis()); - testHarness.processWatermark(EVENT_TIME); - testHarness.endInput(); - } - } - - private static Path tablePath(Table table) { - return FileSystems.getDefault().getPath(table.location().substring(5)); - } - - private static Set listFiles(Table table) throws IOException { - String tableRootPath = TestFixtures.TABLE_IDENTIFIER.toString().replace(".", "/"); - return Files.find( - tablePath(table), Integer.MAX_VALUE, (filePath, fileAttr) -> fileAttr.isRegularFile()) - .map( - p -> - p.toString() - .substring(p.toString().indexOf(tableRootPath) + tableRootPath.length() + 1)) - .collect(Collectors.toSet()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java deleted file mode 100644 index f073272a70b7..000000000000 --- 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestExpireSnapshotsProcessor.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.Queue; -import java.util.Set; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.maintenance.api.TaskResult; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -class TestExpireSnapshotsProcessor extends OperatorTestBase { - @ParameterizedTest - @ValueSource(booleans = {true, false}) - void testExpire(boolean success) throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - - List actual; - Queue> deletes; - try (OneInputStreamOperatorTestHarness testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new ExpireSnapshotsProcessor(tableLoader(), 0L, 1, 10, false))) { - testHarness.open(); - - if (!success) { - // Cause an exception - dropTable(); - } - - testHarness.processElement(Trigger.create(10, 11), System.currentTimeMillis()); - deletes = testHarness.getSideOutput(ExpireSnapshotsProcessor.DELETE_STREAM); - actual = testHarness.extractOutputValues(); - } - - assertThat(actual).hasSize(1); - TaskResult result = actual.get(0); - assertThat(result.startEpoch()).isEqualTo(10); - assertThat(result.taskIndex()).isEqualTo(11); - assertThat(result.success()).isEqualTo(success); - - if (success) { - assertThat(result.exceptions()).isNotNull().isEmpty(); - - table.refresh(); - Set snapshots = Sets.newHashSet(table.snapshots()); - assertThat(snapshots).hasSize(1); - assertThat(deletes).hasSize(1); - } else { - assertThat(result.exceptions()).isNotNull().hasSize(1); - assertThat(deletes).isNull(); - } - } - - @ParameterizedTest - @ValueSource(booleans = {true, false}) - void testCleanExpiredMetadata(boolean cleanExpiredMetadata) throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - table.updateSchema().addColumn("extra", Types.StringType.get()).commit(); - insert(table, 2, "b", "x"); - - assertThat(table.schemas()).hasSize(2); - - List actual; - Queue> deletes; - try (OneInputStreamOperatorTestHarness testHarness = - 
ProcessFunctionTestHarnesses.forProcessFunction( - new ExpireSnapshotsProcessor(tableLoader(), 0L, 1, 10, cleanExpiredMetadata))) { - testHarness.open(); - - testHarness.processElement(Trigger.create(10, 11), System.currentTimeMillis()); - deletes = testHarness.getSideOutput(ExpireSnapshotsProcessor.DELETE_STREAM); - actual = testHarness.extractOutputValues(); - } - - assertThat(actual).hasSize(1); - TaskResult result = actual.get(0); - assertThat(result.startEpoch()).isEqualTo(10); - assertThat(result.taskIndex()).isEqualTo(11); - assertThat(result.success()).isEqualTo(true); - assertThat(result.exceptions()).isNotNull().isEmpty(); - - table.refresh(); - Set snapshots = Sets.newHashSet(table.snapshots()); - assertThat(snapshots).hasSize(1); - assertThat(deletes).hasSize(1); - - if (cleanExpiredMetadata) { - assertThat(table.schemas().values()).containsExactly(table.schema()); - } else { - assertThat(table.schemas()).hasSize(2); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java deleted file mode 100644 index 12478bb33fb2..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListFileSystemFiles.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
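The snapshot-expiry tests above follow the same harness pattern; a condensed sketch, assuming the OperatorTestBase helpers, with the constructor arguments copied from the removed test and the DELETE_STREAM element type written as String because that is how DeleteFilesProcessor consumes it later in this patch:

    try (var testHarness =
        ProcessFunctionTestHarnesses.forProcessFunction(
            new ExpireSnapshotsProcessor(tableLoader(), 0L, 1, 10, false))) {
      testHarness.open();
      testHarness.processElement(Trigger.create(10, 11), System.currentTimeMillis());

      // One TaskResult per trigger on the main output...
      List<TaskResult> results = testHarness.extractOutputValues();
      assertThat(results).hasSize(1);
      assertThat(results.get(0).success()).isTrue();

      // ...while the files to physically delete are emitted on the DELETE_STREAM side output.
      Queue<StreamRecord<String>> deletes =
          testHarness.getSideOutput(ExpireSnapshotsProcessor.DELETE_STREAM);
      assertThat(deletes).isNotNull();
    }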
- */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -class TestListFileSystemFiles extends OperatorTestBase { - @Parameter(index = 0) - private boolean usePrefixListing; - - @Parameters(name = "usePrefixListing = {0}") - private static Object[][] parameters() { - return new Object[][] {{true}, {false}}; - } - - @TestTemplate - void testMetadataFilesWithTable() throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - try (OneInputStreamOperatorTestHarness testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new ListFileSystemFiles( - OperatorTestBase.DUMMY_TABLE_NAME, - 0, - tableLoader(), - table.location(), - 0, - usePrefixListing))) { - testHarness.open(); - OperatorTestBase.trigger(testHarness); - - assertThat(testHarness.extractOutputValues()).hasSize(11); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } - - @TestTemplate - void testMetadataFilesWithPartitionTable() throws Exception { - Table table = createPartitionedTable(); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); - try (OneInputStreamOperatorTestHarness testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new ListFileSystemFiles( - OperatorTestBase.DUMMY_TABLE_NAME, - 0, - tableLoader(), - table.location(), - 0, - usePrefixListing))) { - testHarness.open(); - OperatorTestBase.trigger(testHarness); - - assertThat(testHarness.extractOutputValues()).hasSize(14); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } - - @TestTemplate - void testMetadataFilesWithEmptyTable() throws Exception { - Table table = createTable(); - try (OneInputStreamOperatorTestHarness testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new ListFileSystemFiles( - OperatorTestBase.DUMMY_TABLE_NAME, - 0, - tableLoader(), - table.location(), - 0, - usePrefixListing))) { - testHarness.open(); - OperatorTestBase.trigger(testHarness); - - assertThat(testHarness.extractOutputValues()).hasSize(2); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java deleted file mode 100644 index bb8c74f3d5e9..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestListMetadataFiles.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
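Similarly, the orphan-file listing tests above boil down to the sketch below; it assumes a table with a few committed inserts, reuses only the calls from the removed test, and usePrefixListing stands in for the parameterized flag:

    boolean usePrefixListing = false; // or true, to list via the FileIO prefix API
    try (var testHarness =
        ProcessFunctionTestHarnesses.forProcessFunction(
            new ListFileSystemFiles(
                OperatorTestBase.DUMMY_TABLE_NAME,
                0,
                tableLoader(),
                table.location(),
                0,
                usePrefixListing))) {
      testHarness.open();
      OperatorTestBase.trigger(testHarness);

      // Each file found under the table location is emitted downstream;
      // failures would surface on DeleteOrphanFiles.ERROR_STREAM instead.
      assertThat(testHarness.extractOutputValues()).isNotEmpty();
      assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull();
    }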
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.junit.jupiter.api.Test; - -class TestListMetadataFiles extends OperatorTestBase { - - @Test - void testMetadataFilesWithTable() throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - try (OneInputStreamOperatorTestHarness testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new ListMetadataFiles(OperatorTestBase.DUMMY_TABLE_NAME, 0, tableLoader()))) { - testHarness.open(); - - OperatorTestBase.trigger(testHarness); - - List tableMetadataFiles = testHarness.extractOutputValues(); - tableMetadataFiles.forEach(System.out::println); - assertThat(tableMetadataFiles).hasSize(24); - - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - } - } - - @Test - void testMetadataFilesWithPartitionTable() throws Exception { - Table table = createPartitionedTable(); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); - try (OneInputStreamOperatorTestHarness testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new ListMetadataFiles(OperatorTestBase.DUMMY_TABLE_NAME, 0, tableLoader()))) { - testHarness.open(); - - OperatorTestBase.trigger(testHarness); - - List tableMetadataFiles = testHarness.extractOutputValues(); - assertThat(tableMetadataFiles).hasSize(38); - - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - } - } - - @Test - void testMetadataFilesWithEmptyTable() throws Exception { - createTable(); - try (OneInputStreamOperatorTestHarness testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new ListMetadataFiles(OperatorTestBase.DUMMY_TABLE_NAME, 0, tableLoader()))) { - testHarness.open(); - - OperatorTestBase.trigger(testHarness); - - List tableMetadataFiles = testHarness.extractOutputValues(); - assertThat(tableMetadataFiles).hasSize(0); - - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java deleted file mode 100644 index cec76019ae10..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockConfig.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Map; -import org.apache.flink.configuration.Configuration; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.maintenance.api.LockConfig; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -public class TestLockConfig extends OperatorTestBase { - private static final String TABLE_NAME = "catalog.db.table"; - private static final String LOCK_ID = "test-lock-id"; - private Map input = Maps.newHashMap(); - private Table table; - - @BeforeEach - public void before() { - input.put("flink-maintenance.lock.type", "jdbc"); - input.put("flink-maintenance.lock.lock-id", LOCK_ID); - input.put("other.config", "should-be-ignored"); - this.table = createTable(); - } - - @AfterEach - public void after() { - input.clear(); - } - - @Test - void testConfigParsing() { - LockConfig config = new LockConfig(table, input, new Configuration()); - - assertThat(config.lockType()).isEqualTo("jdbc"); - assertThat(config.lockId(LOCK_ID)).isEqualTo(LOCK_ID); - } - - @Test - void testEmptyConfig() { - LockConfig config = new LockConfig(table, Maps.newHashMap(), new Configuration()); - - assertThat(config.lockType()).isEmpty(); - assertThat(config.lockId(TABLE_NAME)).isEqualTo(TABLE_NAME); - } - - @Test - void testWriteOptionReplaceSetConfig() { - Configuration configuration = new Configuration(); - configuration.setString("flink-maintenance.lock.type", "zk"); - configuration.setString("flink-maintenance.lock.replace-item", "test-config"); - configuration.setString("flink-maintenance.lock.jdbc.init-lock-table", "true"); - LockConfig config = new LockConfig(table, input, configuration); - - // set config should be ignored - assertThat(config.lockType()).isEqualTo("jdbc"); - assertThat(config.jdbcInitTable()).isEqualTo("true"); - - assertThat(config.properties()) - .doesNotContainKey("other.config") - .containsEntry("type", "jdbc") - .containsEntry("replace-item", "test-config"); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java deleted file mode 100644 index d32d5f840c4b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockFactoryBuilder.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
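The lock configuration rules the removed TestLockConfig pins down are: write options under the flink-maintenance.lock. prefix win over values set on the Flink Configuration, and unrelated keys are dropped from properties(). A condensed sketch using only calls from that test, assuming a Table table from the test base:

    Map<String, String> writeOptions = Maps.newHashMap();
    writeOptions.put("flink-maintenance.lock.type", "jdbc");
    writeOptions.put("flink-maintenance.lock.lock-id", "test-lock-id");
    writeOptions.put("other.config", "should-be-ignored");

    Configuration flinkConf = new Configuration();
    flinkConf.setString("flink-maintenance.lock.type", "zk"); // overridden by the write option

    LockConfig config = new LockConfig(table, writeOptions, flinkConf);

    assertThat(config.lockType()).isEqualTo("jdbc");
    // The argument is only the fallback used when no lock-id is configured.
    assertThat(config.lockId("test-lock-id")).isEqualTo("test-lock-id");
    assertThat(config.properties()).doesNotContainKey("other.config");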
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.util.Map; -import org.apache.curator.test.TestingServer; -import org.apache.flink.configuration.Configuration; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.maintenance.api.LockConfig; -import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -class TestLockFactoryBuilder extends OperatorTestBase { - private static final String TABLE_NAME = "catalog.db.table"; - - private TestingServer zkTestServer; - private Table table; - - @BeforeEach - void before() { - this.table = createTable(); - try { - zkTestServer = new TestingServer(); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - @AfterEach - public void after() throws IOException { - if (zkTestServer != null) { - zkTestServer.close(); - } - } - - @Test - void testJdbcBuildWithMissingJdbcUri() { - Map config = Maps.newHashMap(); - config.put(LockConfig.LOCK_TYPE_OPTION.key(), LockConfig.JdbcLockConfig.JDBC); - LockConfig lockConfig = new LockConfig(table, config, new Configuration()); - - assertThatThrownBy(() -> LockFactoryBuilder.build(lockConfig, TABLE_NAME)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining( - String.format( - "JDBC lock requires %s parameter", - LockConfig.JdbcLockConfig.JDBC_URI_OPTION.key())); - } - - @Test - void testJdbcBuildSuccessfully() { - Map config = Maps.newHashMap(); - config.put(LockConfig.LOCK_TYPE_OPTION.key(), LockConfig.JdbcLockConfig.JDBC); - config.put(LockConfig.JdbcLockConfig.JDBC_URI_OPTION.key(), "jdbc:sqlite:file::memory:?ic"); - config.put(LockConfig.LOCK_ID_OPTION.key(), "test-lock-id"); - LockConfig lockConfig = new LockConfig(table, config, new Configuration()); - - TriggerLockFactory factory = LockFactoryBuilder.build(lockConfig, TABLE_NAME); - assertThat(factory).isNotNull(); - } - - @Test - void testZkBuildWithMissingUri() { - Map config = Maps.newHashMap(); - config.put(LockConfig.LOCK_TYPE_OPTION.key(), LockConfig.ZkLockConfig.ZK); - LockConfig lockConfig = new LockConfig(table, config, new Configuration()); - - assertThatThrownBy(() -> LockFactoryBuilder.build(lockConfig, TABLE_NAME)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining( - String.format( - "Zk lock requires %s parameter", LockConfig.ZkLockConfig.ZK_URI_OPTION.key())); - } - - @Test - void testZkBuildSuccessfully() { - Map config = Maps.newHashMap(); - config.put(LockConfig.LOCK_TYPE_OPTION.key(), LockConfig.ZkLockConfig.ZK); - config.put(LockConfig.ZkLockConfig.ZK_URI_OPTION.key(), zkTestServer.getConnectString()); - 
config.put(LockConfig.LOCK_ID_OPTION.key(), "test-lock-id"); - LockConfig lockConfig = new LockConfig(table, config, new Configuration()); - - TriggerLockFactory factory = LockFactoryBuilder.build(lockConfig, TABLE_NAME); - assertThat(factory).isNotNull(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java deleted file mode 100644 index a163dcaa71f7..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestLockRemover.java +++ /dev/null @@ -1,443 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.FAILED_TASK_COUNTER; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.LAST_RUN_DURATION_MS; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.SUCCEEDED_TASK_COUNTER; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.util.Collection; -import java.util.List; -import org.apache.flink.api.common.functions.FlatMapFunction; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.sink2.Committer; -import org.apache.flink.api.connector.sink2.CommitterInitContext; -import org.apache.flink.api.connector.sink2.CommittingSinkWriter; -import org.apache.flink.api.connector.sink2.Sink; -import org.apache.flink.api.connector.sink2.SinkWriter; -import org.apache.flink.api.connector.sink2.SupportsCommitter; -import org.apache.flink.configuration.CheckpointingOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; -import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; -import org.apache.flink.streaming.api.connector.sink2.SupportsPostCommitTopology; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.util.Collector; -import org.apache.iceberg.flink.maintenance.api.TaskResult; -import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.awaitility.Awaitility; -import 
org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.io.TempDir; - -@Timeout(value = 10) -class TestLockRemover extends OperatorTestBase { - private static final String[] TASKS = new String[] {"task0", "task1", "task2"}; - private static final TriggerLockFactory.Lock LOCK = new TestingLock(); - private static final TriggerLockFactory.Lock RECOVERY_LOCK = new TestingLock(); - - @TempDir private File checkpointDir; - - @BeforeEach - void before() { - MetricsReporterFactoryForTests.reset(); - } - - @Test - void testProcess() throws Exception { - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - ManualSource source = new ManualSource<>(env, TypeInformation.of(TaskResult.class)); - source - .dataStream() - .transform( - DUMMY_TASK_NAME, - TypeInformation.of(Void.class), - new LockRemover(DUMMY_TABLE_NAME, new TestingLockFactory(), Lists.newArrayList(TASKS))) - .setParallelism(1); - - JobClient jobClient = null; - try { - jobClient = env.executeAsync(); - - LOCK.tryLock(); - assertThat(LOCK.isHeld()).isTrue(); - - // Start a successful trigger for task1 and assert the return value is correct - processAndCheck(source, new TaskResult(0, 0L, true, Lists.newArrayList())); - - // Assert that the lock is removed - assertThat(LOCK.isHeld()).isFalse(); - } finally { - closeJobClient(jobClient); - } - } - - @Test - void testInSink() throws Exception { - String sinkName = "TestSink"; - Configuration config = new Configuration(); - config.set(CheckpointingOptions.CHECKPOINT_STORAGE, "filesystem"); - config.set(CheckpointingOptions.CHECKPOINTS_DIRECTORY, "file://" + checkpointDir.getPath()); - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(config); - env.enableCheckpointing(10); - ManualSource source = new ManualSource<>(env, TypeInformation.of(TaskResult.class)); - source.dataStream().global().sinkTo(new SinkTest()).name(sinkName).setParallelism(1); - - JobClient jobClient = null; - try { - jobClient = env.executeAsync(); - - LOCK.tryLock(); - assertThat(LOCK.isHeld()).isTrue(); - - // Start a successful trigger for task1 and assert the return value is correct - processAndCheck(source, new TaskResult(0, 0L, true, Lists.newArrayList()), sinkName + ": "); - - // Assert that the lock is removed - assertThat(LOCK.isHeld()).isFalse(); - } finally { - closeJobClient(jobClient); - } - } - - @Test - void testMetrics() throws Exception { - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - ManualSource source = new ManualSource<>(env, TypeInformation.of(TaskResult.class)); - source - .dataStream() - .transform( - DUMMY_TASK_NAME, - TypeInformation.of(Void.class), - new LockRemover(DUMMY_TABLE_NAME, new TestingLockFactory(), Lists.newArrayList(TASKS))) - .setParallelism(1); - - JobClient jobClient = null; - long time = System.currentTimeMillis(); - try { - jobClient = env.executeAsync(); - // Start the 2 successful and one failed result trigger for task1, and 3 successful for task2 - processAndCheck(source, new TaskResult(0, time, true, Lists.newArrayList())); - processAndCheck(source, new TaskResult(1, 0L, true, Lists.newArrayList())); - processAndCheck(source, new TaskResult(1, 0L, true, Lists.newArrayList())); - processAndCheck(source, new TaskResult(0, time, false, Lists.newArrayList())); - processAndCheck(source, new TaskResult(0, time, true, Lists.newArrayList())); - processAndCheck(source, new TaskResult(1, 0L, 
true, Lists.newArrayList())); - - Awaitility.await() - .until( - () -> - MetricsReporterFactoryForTests.counter( - ImmutableList.of( - DUMMY_TASK_NAME, - DUMMY_TABLE_NAME, - TASKS[1], - "1", - SUCCEEDED_TASK_COUNTER)) - .equals(3L)); - - // Final check all the counters - MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put( - ImmutableList.of( - DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[0], "0", SUCCEEDED_TASK_COUNTER), - 2L) - .put( - ImmutableList.of( - DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[0], "0", FAILED_TASK_COUNTER), - 1L) - .put( - ImmutableList.of( - DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[1], "1", SUCCEEDED_TASK_COUNTER), - 3L) - .put( - ImmutableList.of( - DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[1], "1", FAILED_TASK_COUNTER), - 0L) - .put( - ImmutableList.of( - DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[2], "2", SUCCEEDED_TASK_COUNTER), - 0L) - .put( - ImmutableList.of( - DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[2], "2", FAILED_TASK_COUNTER), - 0L) - .build()); - - assertThat( - MetricsReporterFactoryForTests.gauge( - ImmutableList.of( - DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[0], "0", LAST_RUN_DURATION_MS))) - .isPositive(); - assertThat( - MetricsReporterFactoryForTests.gauge( - ImmutableList.of( - DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[1], "1", LAST_RUN_DURATION_MS))) - .isGreaterThan(time); - assertThat( - MetricsReporterFactoryForTests.gauge( - ImmutableList.of( - DUMMY_TASK_NAME, DUMMY_TABLE_NAME, TASKS[2], "2", LAST_RUN_DURATION_MS))) - .isZero(); - } finally { - closeJobClient(jobClient); - } - } - - /** - * The test checks if the recovery watermark is only removed if the watermark has arrived from - * both upstream sources. - * - * @throws Exception if any - */ - @Test - void testRecovery() throws Exception { - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - ManualSource source1 = - new ManualSource<>(env, TypeInformation.of(TaskResult.class)); - ManualSource source2 = - new ManualSource<>(env, TypeInformation.of(TaskResult.class)); - source1 - .dataStream() - .union(source2.dataStream()) - .transform( - DUMMY_TASK_NAME, - TypeInformation.of(Void.class), - new LockRemover( - DUMMY_TABLE_NAME, new TestingLockFactory(), Lists.newArrayList(TASKS[0]))) - .setParallelism(1); - - JobClient jobClient = null; - try { - jobClient = env.executeAsync(); - RECOVERY_LOCK.tryLock(); - assertThat(RECOVERY_LOCK.isHeld()).isTrue(); - - processAndCheck(source1, new TaskResult(0, 0L, true, Lists.newArrayList())); - - source1.sendRecord(new TaskResult(0, 1L, true, Lists.newArrayList())); - // we receive the second result - this will not happen in real use cases, but with this we can - // be sure that the previous watermark is processed - Awaitility.await() - .until( - () -> - MetricsReporterFactoryForTests.counter( - ImmutableList.of( - DUMMY_TASK_NAME, - DUMMY_TABLE_NAME, - TASKS[0], - "0", - SUCCEEDED_TASK_COUNTER)) - .equals(2L)); - - // We did not remove the recovery lock, as no watermark received from the other source - assertThat(RECOVERY_LOCK.isHeld()).isTrue(); - - // Recovery arrives - source1.sendWatermark(10L); - source2.sendWatermark(10L); - - Awaitility.await().until(() -> !RECOVERY_LOCK.isHeld()); - } finally { - closeJobClient(jobClient); - } - } - - private void processAndCheck(ManualSource source, TaskResult input) { - processAndCheck(source, input, null); - } - - private void processAndCheck( - ManualSource source, TaskResult input, String counterPrefix) { - List counterKey = - 
ImmutableList.of( - (counterPrefix != null ? counterPrefix : "") + DUMMY_TASK_NAME, - DUMMY_TABLE_NAME, - TASKS[input.taskIndex()], - String.valueOf(input.taskIndex()), - input.success() ? SUCCEEDED_TASK_COUNTER : FAILED_TASK_COUNTER); - Long counterValue = MetricsReporterFactoryForTests.counter(counterKey); - Long expected = counterValue != null ? counterValue + 1 : 1L; - - source.sendRecord(input); - source.sendWatermark(input.startEpoch()); - - Awaitility.await() - .until(() -> expected.equals(MetricsReporterFactoryForTests.counter(counterKey))); - } - - private static class TestingLockFactory implements TriggerLockFactory { - - private boolean open = false; - - @Override - public void open() { - open = true; - } - - @Override - public Lock createLock() { - if (!open) { - throw new IllegalStateException("Lock factory not open"); - } - - return LOCK; - } - - @Override - public Lock createRecoveryLock() { - if (!open) { - throw new IllegalStateException("Lock factory not open"); - } - - return RECOVERY_LOCK; - } - - @Override - public void close() { - open = false; - } - } - - private static class TestingLock implements TriggerLockFactory.Lock { - private boolean locked = false; - - @Override - public boolean tryLock() { - if (isHeld()) { - return false; - } else { - locked = true; - return true; - } - } - - @Override - public boolean isHeld() { - return locked; - } - - @Override - public void unlock() { - locked = false; - } - } - - private static class SinkTest - implements Sink, - SupportsCommitter, - SupportsPostCommitTopology { - @Override - public SinkWriter createWriter(InitContext initContext) { - return new CommittingSinkWriter() { - private final Collection received = Lists.newArrayList(); - - @Override - public Collection prepareCommit() { - Collection result = Lists.newArrayList(received); - received.clear(); - return result; - } - - @Override - public void write(TaskResult taskResult, Context context) { - received.add(taskResult); - } - - @Override - public void flush(boolean b) { - // noop - } - - @Override - public void close() { - // noop - } - }; - } - - @Override - public Committer createCommitter(CommitterInitContext committerInitContext) { - return new Committer<>() { - @Override - public void commit(Collection> collection) { - // noop - } - - @Override - public void close() { - // noop - } - }; - } - - @Override - public SimpleVersionedSerializer getCommittableSerializer() { - return new SimpleVersionedSerializer<>() { - @Override - public int getVersion() { - return 0; - } - - @Override - public byte[] serialize(TaskResult taskResult) { - return new byte[0]; - } - - @Override - public TaskResult deserialize(int i, byte[] bytes) { - return null; - } - }; - } - - @Override - public void addPostCommitTopology(DataStream> committables) { - committables - .flatMap( - new FlatMapFunction, TaskResult>() { - @Override - public void flatMap( - CommittableMessage taskResultCommittableMessage, - Collector collector) { - if (taskResultCommittableMessage instanceof CommittableWithLineage) { - collector.collect( - ((CommittableWithLineage) taskResultCommittableMessage) - .getCommittable()); - } - } - }) - .transform( - DUMMY_TASK_NAME, - TypeInformation.of(Void.class), - new LockRemover( - DUMMY_TABLE_NAME, new TestingLockFactory(), Lists.newArrayList(TASKS[0]))); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java deleted file mode 100644 index c561c7054eae..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java +++ /dev/null @@ -1,347 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.File; -import java.io.IOException; -import java.time.Duration; -import java.util.List; -import java.util.concurrent.atomic.AtomicReference; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; -import org.apache.flink.configuration.CheckpointingOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.runtime.client.JobExecutionException; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.RewriteFiles; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -class TestMonitorSource extends OperatorTestBase { - private static final TableChange EMPTY_EVENT = TableChange.empty(); - private static final RateLimiterStrategy HIGH_RATE = RateLimiterStrategy.perSecond(100.0); - private static final RateLimiterStrategy LOW_RATE = RateLimiterStrategy.perSecond(1.0 / 10000.0); - - @TempDir private File checkpointDir; - - @ParameterizedTest - @ValueSource(booleans = {true, false}) - void testChangeReaderIterator(boolean withDelete) throws IOException { - Table table = withDelete ? 
createTableWithDelete() : createTable(); - - MonitorSource.TableChangeIterator iterator = - new MonitorSource.TableChangeIterator(tableLoader(), null, Long.MAX_VALUE); - - // For an empty table we get an empty result - assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); - - // Add a single commit and get back the commit data in the event - insert(table, 1, "a"); - TableChange expected = tableChangeWithLastSnapshot(table, TableChange.empty()); - assertThat(iterator.next()).isEqualTo(expected); - // Make sure that consecutive calls do not return the data again - assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); - - // Add two more commits, but fetch the data in one loop - insert(table, 2, "b"); - expected = tableChangeWithLastSnapshot(table, TableChange.empty()); - - insert(table, 3, "c"); - expected = tableChangeWithLastSnapshot(table, expected); - - assertThat(iterator.next()).isEqualTo(expected); - // Make sure that consecutive calls do not return the data again - assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); - } - - /** - * Create a table and check that the source returns the data as new commits arrive to the table. - */ - @Test - void testSource() throws Exception { - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - Table table = createTable(); - DataStream events = - env.fromSource( - new MonitorSource(tableLoader(), HIGH_RATE, Long.MAX_VALUE), - WatermarkStrategy.noWatermarks(), - "TableChangeSource") - .forceNonParallel(); - - // Sink to collect the results - CollectingSink result = new CollectingSink<>(); - events.sinkTo(result); - - JobClient jobClient = null; - try { - // First result is an empty event - jobClient = env.executeAsync("Table Change Source Test"); - assertThat(result.poll(Duration.ofSeconds(5L))).isEqualTo(EMPTY_EVENT); - - // Insert some data - File dataDir = new File(new Path(table.location(), "data").toUri().getPath()); - dataDir.mkdir(); - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(table, FileFormat.PARQUET, dataDir.toPath()); - List batch1 = RandomGenericData.generate(table.schema(), 2, 1); - dataAppender.appendToTable(batch1); - - // Wait until the changes are committed - Awaitility.await() - .until( - () -> { - table.refresh(); - return table.currentSnapshot() != null; - }); - - table.refresh(); - long size = firstFileLength(table); - - // Wait until the first non-empty event has arrived, and check the expected result - Awaitility.await() - .until( - () -> { - TableChange newEvent = result.poll(Duration.ofSeconds(5L)); - // Fetch every empty event from the beginning - while (newEvent.equals(EMPTY_EVENT)) { - newEvent = result.poll(Duration.ofSeconds(5L)); - } - - // The first non-empty event should contain the expected value - return newEvent.equals( - TableChange.builder() - .dataFileCount(1) - .dataFileSizeInBytes(size) - .commitCount(1) - .build()); - }); - } finally { - closeJobClient(jobClient); - } - } - - /** Check that the {@link MonitorSource} operator state is restored correctly. 
*/ - @Test - void testStateRestore(@TempDir File savepointDir) throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - TableLoader tableLoader = tableLoader(); - - Configuration config = new Configuration(); - config.set(CheckpointingOptions.CHECKPOINT_STORAGE, "filesystem"); - config.set(CheckpointingOptions.CHECKPOINTS_DIRECTORY, "file://" + checkpointDir.getPath()); - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(config); - env.enableCheckpointing(1000); - - DataStream events = - env.fromSource( - new MonitorSource(tableLoader, HIGH_RATE, Long.MAX_VALUE), - WatermarkStrategy.noWatermarks(), - "TableChangeSource") - .forceNonParallel(); - - // Sink to collect the results - CollectingSink result = new CollectingSink<>(); - events.sinkTo(result); - - // Start the job - Configuration conf; - JobClient jobClient = null; - AtomicReference firstNonEmptyEvent = new AtomicReference<>(); - try { - jobClient = env.executeAsync("Table Change Source Test"); - - Awaitility.await() - .until( - () -> { - TableChange newEvent = result.poll(Duration.ofSeconds(5L)); - // Fetch every empty event from the beginning - while (newEvent.equals(EMPTY_EVENT)) { - newEvent = result.poll(Duration.ofSeconds(5L)); - } - - // The first non-empty event should contain the expected value - firstNonEmptyEvent.set(newEvent); - return true; - }); - } finally { - // Stop with savepoint - conf = closeJobClient(jobClient, savepointDir); - } - - // Restore from savepoint, create the same topology with a different env - env = StreamExecutionEnvironment.getExecutionEnvironment(conf); - events = - env.fromSource( - new MonitorSource(tableLoader, LOW_RATE, Long.MAX_VALUE), - WatermarkStrategy.noWatermarks(), - "TableChangeSource") - .forceNonParallel(); - CollectingSink resultWithSavepoint = new CollectingSink<>(); - events.sinkTo(resultWithSavepoint); - - // Make sure that the job with restored source does not read new records from the table - JobClient clientWithSavepoint = null; - try { - clientWithSavepoint = env.executeAsync("Table Change Source test with savepoint"); - - assertThat(resultWithSavepoint.poll(Duration.ofSeconds(5L))).isEqualTo(EMPTY_EVENT); - } finally { - closeJobClient(clientWithSavepoint, null); - } - - // Restore without savepoint - env = StreamExecutionEnvironment.getExecutionEnvironment(); - events = - env.fromSource( - new MonitorSource(tableLoader, LOW_RATE, Long.MAX_VALUE), - WatermarkStrategy.noWatermarks(), - "TableChangeSource") - .forceNonParallel(); - CollectingSink resultWithoutSavepoint = new CollectingSink<>(); - events.sinkTo(resultWithoutSavepoint); - - // Make sure that a new job without state reads the event as expected - JobClient clientWithoutSavepoint = null; - try { - clientWithoutSavepoint = env.executeAsync("Table Change Source Test without savepoint"); - assertThat(resultWithoutSavepoint.poll(Duration.ofSeconds(5L))) - .isEqualTo(firstNonEmptyEvent.get()); - } finally { - closeJobClient(clientWithoutSavepoint); - } - } - - @Test - void testNotOneParallelismThrows() { - createTable(); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - - env.fromSource( - new MonitorSource(tableLoader(), HIGH_RATE, Long.MAX_VALUE), - WatermarkStrategy.noWatermarks(), - "TableChangeSource") - .setParallelism(2) - .print(); - - assertThatThrownBy(env::execute) - .isInstanceOf(JobExecutionException.class) - .rootCause() - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Parallelism should be set 
to 1"); - } - - @Test - void testMaxReadBack() throws IOException { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - - TableLoader tableLoader = tableLoader(); - - MonitorSource.TableChangeIterator iterator = - new MonitorSource.TableChangeIterator(tableLoader, null, 1); - - // For a single maxReadBack we only get a single change - assertThat(iterator.next().commitCount()).isEqualTo(1); - - iterator = new MonitorSource.TableChangeIterator(tableLoader, null, 2); - - // Expecting 2 commits/snapshots for maxReadBack=2 - assertThat(iterator.next().commitCount()).isEqualTo(2); - - iterator = new MonitorSource.TableChangeIterator(tableLoader, null, Long.MAX_VALUE); - - // For maxReadBack Long.MAX_VALUE we get every change - assertThat(iterator.next().commitCount()).isEqualTo(3); - } - - @Test - void testSkipReplace() throws IOException { - Table table = createTable(); - insert(table, 1, "a"); - - TableLoader tableLoader = tableLoader(); - - MonitorSource.TableChangeIterator iterator = - new MonitorSource.TableChangeIterator(tableLoader, null, Long.MAX_VALUE); - - // Read the current snapshot - assertThat(iterator.next().commitCount()).isEqualTo(1); - - // Create a DataOperations.REPLACE snapshot - DataFile dataFile = - table.snapshots().iterator().next().addedDataFiles(table.io()).iterator().next(); - RewriteFiles rewrite = tableLoader.loadTable().newRewrite(); - // Replace the file with itself for testing purposes - rewrite.deleteFile(dataFile); - rewrite.addFile(dataFile); - rewrite.commit(); - - // Check that the rewrite is ignored - assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); - } - - private static long firstFileLength(Table table) { - return table.currentSnapshot().addedDataFiles(table.io()).iterator().next().fileSizeInBytes(); - } - - private static TableChange tableChangeWithLastSnapshot(Table table, TableChange previous) { - List dataFiles = - Lists.newArrayList(table.currentSnapshot().addedDataFiles(table.io()).iterator()); - List deleteFiles = - Lists.newArrayList(table.currentSnapshot().addedDeleteFiles(table.io()).iterator()); - - long dataSize = dataFiles.stream().mapToLong(ContentFile::fileSizeInBytes).sum(); - long deleteRecordCount = deleteFiles.stream().mapToLong(DeleteFile::recordCount).sum(); - - TableChange newChange = previous.copy(); - newChange.merge( - TableChange.builder() - .dataFileCount(dataFiles.size()) - .dataFileSizeInBytes(dataSize) - // Currently we only test with equality deletes - .eqDeleteFileCount(deleteFiles.size()) - .eqDeleteRecordCount(deleteRecordCount) - .commitCount(1) - .build()); - return newChange; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java deleted file mode 100644 index de3d01409b9d..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestOrphanFilesDetector.java +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Map; -import java.util.concurrent.ConcurrentLinkedQueue; -import org.apache.flink.api.common.typeinfo.BasicTypeInfo; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.util.KeyedTwoInputStreamOperatorTestHarness; -import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.actions.DeleteOrphanFiles.PrefixMismatchMode; -import org.apache.iceberg.actions.FileURI; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.flink.maintenance.api.DeleteOrphanFiles; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.Test; - -public class TestOrphanFilesDetector extends OperatorTestBase { - private static final Map EQUAL_SCHEMES = - Maps.newHashMap( - ImmutableMap.of( - "s3n", "s3", - "s3a", "s3")); - private static final Map EQUAL_AUTHORITIES = Maps.newHashMap(); - private static final String SCHEME_FILE_1 = "s3:/fileName1"; - private static final String AUTHORITY_FILE_1 = "s3://HDFS1002060/fileName1"; - private static final String ONE_AUTHORITY_SCHEME_FILE_1 = "s3a://HDFS1002060/fileName1"; - private static final String TWO_AUTHORITY_SCHEME_FILE_1 = "s3b://HDFS1002060/fileName1"; - - @Test - void testFileSystemFirst() throws Exception { - try (KeyedTwoInputStreamOperatorTestHarness testHarness = - testHarness()) { - testHarness.open(); - - testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); - testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); - testHarness.processWatermark1(WATERMARK); - testHarness.processElement1(SCHEME_FILE_1, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processWatermark2(WATERMARK); - assertThat(testHarness.extractOutputValues()).isEmpty(); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } - - @Test - void testTableFirst() throws Exception { - try (KeyedTwoInputStreamOperatorTestHarness testHarness = - testHarness()) { - testHarness.open(); - - testHarness.processElement1(SCHEME_FILE_1, EVENT_TIME); - testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); - testHarness.processWatermark1(WATERMARK); - testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processWatermark2(WATERMARK); - assertThat(testHarness.extractOutputValues()).isEmpty(); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } - - @Test - void testOnlyFileSystem() throws Exception { - try (KeyedTwoInputStreamOperatorTestHarness testHarness = - testHarness()) { - testHarness.open(); - - testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); - 
testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processBothWatermarks(WATERMARK); - assertThat(testHarness.extractOutputValues()).isEqualTo(ImmutableList.of(SCHEME_FILE_1)); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } - - @Test - void testOnlyTable() throws Exception { - try (KeyedTwoInputStreamOperatorTestHarness testHarness = - testHarness()) { - testHarness.open(); - - testHarness.processElement1(SCHEME_FILE_1, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processBothWatermarks(WATERMARK); - assertThat(testHarness.extractOutputValues()).isEmpty(); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } - - @Test - void testFileSystemWithAuthority() throws Exception { - try (KeyedTwoInputStreamOperatorTestHarness testHarness = - testHarness()) { - testHarness.open(); - - testHarness.processElement1(SCHEME_FILE_1, EVENT_TIME); - testHarness.processElement2(AUTHORITY_FILE_1, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processBothWatermarks(WATERMARK); - assertThat(testHarness.extractOutputValues()).isEmpty(); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } - - @Test - void testTableWithAuthority() throws Exception { - try (KeyedTwoInputStreamOperatorTestHarness testHarness = - testHarness()) { - testHarness.open(); - - testHarness.processElement1(AUTHORITY_FILE_1, EVENT_TIME); - testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processBothWatermarks(WATERMARK); - ConcurrentLinkedQueue> errorList = - testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM); - assertThat(errorList).hasSize(1); - assertThat(errorList.stream().findFirst().get().getValue()) - .isInstanceOf(ValidationException.class); - - assertThat(testHarness.extractOutputValues()).isEmpty(); - } - } - - @Test - void testDiffScheme() throws Exception { - try (KeyedTwoInputStreamOperatorTestHarness testHarness = - testHarness()) { - testHarness.open(); - - testHarness.processElement1(AUTHORITY_FILE_1, EVENT_TIME); - testHarness.processElement2(ONE_AUTHORITY_SCHEME_FILE_1, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processBothWatermarks(WATERMARK); - assertThat(testHarness.extractOutputValues()).isEmpty(); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } - - @Test - void testUnRegisterScheme() throws Exception { - try (KeyedTwoInputStreamOperatorTestHarness testHarness = - testHarness()) { - testHarness.open(); - - testHarness.processElement1(AUTHORITY_FILE_1, EVENT_TIME); - testHarness.processElement2(TWO_AUTHORITY_SCHEME_FILE_1, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processBothWatermarks(WATERMARK); - ConcurrentLinkedQueue> errorList = - testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM); - assertThat(errorList).hasSize(1); - assertThat(errorList.stream().findFirst().get().getValue()) - .isInstanceOf(ValidationException.class); - - assertThat(testHarness.extractOutputValues()).isEmpty(); - } - } - - @Test - void testPrefixMismatchModeDelete() throws Exception { - try (KeyedTwoInputStreamOperatorTestHarness testHarness = - testHarness(PrefixMismatchMode.DELETE)) { - testHarness.open(); - - 
testHarness.processElement1(AUTHORITY_FILE_1, EVENT_TIME); - testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processBothWatermarks(WATERMARK); - assertThat(testHarness.extractOutputValues()).isEqualTo(ImmutableList.of(SCHEME_FILE_1)); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } - - @Test - void testPrefixMismatchModeIgnore() throws Exception { - try (KeyedTwoInputStreamOperatorTestHarness testHarness = - testHarness(PrefixMismatchMode.IGNORE)) { - testHarness.open(); - - testHarness.processElement1(AUTHORITY_FILE_1, EVENT_TIME); - testHarness.processElement2(SCHEME_FILE_1, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processBothWatermarks(WATERMARK); - assertThat(testHarness.extractOutputValues()).isEmpty(); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } - - @Test - void testMultiAuthority() throws Exception { - try (KeyedTwoInputStreamOperatorTestHarness testHarness = - testHarness(PrefixMismatchMode.IGNORE)) { - testHarness.open(); - - testHarness.processElement1(TWO_AUTHORITY_SCHEME_FILE_1, EVENT_TIME); - testHarness.processElement1(ONE_AUTHORITY_SCHEME_FILE_1, EVENT_TIME); - testHarness.processElement2(AUTHORITY_FILE_1, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processBothWatermarks(WATERMARK); - assertThat(testHarness.extractOutputValues()).isEmpty(); - assertThat(testHarness.getSideOutput(DeleteOrphanFiles.ERROR_STREAM)).isNull(); - } - } - - private static KeyedTwoInputStreamOperatorTestHarness testHarness( - PrefixMismatchMode prefixMismatchMode) throws Exception { - return ProcessFunctionTestHarnesses.forKeyedCoProcessFunction( - new OrphanFilesDetector(prefixMismatchMode, EQUAL_SCHEMES, EQUAL_AUTHORITIES), - (KeySelector) - t -> new FileURI(new Path(t).toUri(), EQUAL_SCHEMES, EQUAL_AUTHORITIES).getPath(), - (KeySelector) - t -> new FileURI(new Path(t).toUri(), EQUAL_SCHEMES, EQUAL_AUTHORITIES).getPath(), - BasicTypeInfo.STRING_TYPE_INFO); - } - - private static KeyedTwoInputStreamOperatorTestHarness - testHarness() throws Exception { - return testHarness(PrefixMismatchMode.ERROR); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java deleted file mode 100644 index ce5b7ad82ac1..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestSkipOnError.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.streaming.util.TwoInputStreamOperatorTestHarness; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -class TestSkipOnError extends OperatorTestBase { - - private static final Exception EXCEPTION = new Exception("Test error"); - - @Test - void testNoFailure() throws Exception { - try (TwoInputStreamOperatorTestHarness testHarness = - new TwoInputStreamOperatorTestHarness<>(new SkipOnError())) { - testHarness.open(); - - testHarness.processElement1(FILE_NAME_1, EVENT_TIME); - testHarness.processElement1(FILE_NAME_2, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - - testHarness.processBothWatermarks(WATERMARK); - assertThat(testHarness.extractOutputValues()) - .isEqualTo(ImmutableList.of(FILE_NAME_1, FILE_NAME_2)); - } - } - - @Test - void testFailure() throws Exception { - try (TwoInputStreamOperatorTestHarness testHarness = - new TwoInputStreamOperatorTestHarness<>(new SkipOnError())) { - testHarness.open(); - - testHarness.processElement1(FILE_NAME_1, EVENT_TIME); - testHarness.processElement2(EXCEPTION, EVENT_TIME); - testHarness.processElement1(FILE_NAME_2, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - - testHarness.processBothWatermarks(WATERMARK); - assertThat(testHarness.extractOutputValues()).isEmpty(); - } - } - - @ParameterizedTest - @ValueSource(booleans = {true, false}) - void testStateRestore(boolean withError) throws Exception { - OperatorSubtaskState state; - try (TwoInputStreamOperatorTestHarness testHarness = - new TwoInputStreamOperatorTestHarness<>(new SkipOnError())) { - testHarness.open(); - - testHarness.processElement1(FILE_NAME_1, EVENT_TIME); - if (withError) { - testHarness.processElement2(EXCEPTION, EVENT_TIME); - } - - assertThat(testHarness.extractOutputValues()).isEmpty(); - state = testHarness.snapshot(1L, EVENT_TIME); - } - - try (TwoInputStreamOperatorTestHarness testHarness = - new TwoInputStreamOperatorTestHarness<>(new SkipOnError())) { - testHarness.initializeState(state); - testHarness.open(); - - testHarness.processElement1(FILE_NAME_2, EVENT_TIME); - - assertThat(testHarness.extractOutputValues()).isEmpty(); - testHarness.processBothWatermarks(WATERMARK); - if (withError) { - assertThat(testHarness.extractOutputValues()).isEmpty(); - } else { - assertThat(testHarness.extractOutputValues()) - .isEqualTo(ImmutableList.of(FILE_NAME_1, FILE_NAME_2)); - } - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java deleted file mode 100644 index 87b0303b488d..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTablePlanerAndReader.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.flink.source.ScanContext; -import org.junit.jupiter.api.Test; - -class TestTablePlanerAndReader extends OperatorTestBase { - private static final Schema FILE_PATH_SCHEMA = new Schema(DataFile.FILE_PATH); - private static final ScanContext FILE_PATH_SCAN_CONTEXT = - ScanContext.builder().streaming(true).project(FILE_PATH_SCHEMA).build(); - - @Test - void testTablePlaneAndRead() throws Exception { - Table table = createTable(); - insert(table, 1, "a"); - insert(table, 2, "b"); - List icebergSourceSplits; - try (OneInputStreamOperatorTestHarness testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new MetadataTablePlanner( - OperatorTestBase.DUMMY_TASK_NAME, - 0, - tableLoader(), - FILE_PATH_SCAN_CONTEXT, - MetadataTableType.ALL_FILES, - 1))) { - testHarness.open(); - OperatorTestBase.trigger(testHarness); - icebergSourceSplits = testHarness.extractOutputValues(); - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - } - - try (OneInputStreamOperatorTestHarness testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new FileNameReader( - OperatorTestBase.DUMMY_TASK_NAME, - 0, - tableLoader(), - FILE_PATH_SCHEMA, - FILE_PATH_SCAN_CONTEXT, - MetadataTableType.ALL_FILES))) { - testHarness.open(); - for (MetadataTablePlanner.SplitInfo icebergSourceSplit : icebergSourceSplits) { - testHarness.processElement(icebergSourceSplit, System.currentTimeMillis()); - } - - assertThat(testHarness.extractOutputValues()).hasSize(2); - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - } - } - - @Test - void testTablePlaneAndReadWithPartitionedTable() throws Exception { - Table table = createPartitionedTable(); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); - List icebergSourceSplits; - try (OneInputStreamOperatorTestHarness testHarness = - ProcessFunctionTestHarnesses.forProcessFunction( - new MetadataTablePlanner( - OperatorTestBase.DUMMY_TASK_NAME, - 0, - tableLoader(), - FILE_PATH_SCAN_CONTEXT, - MetadataTableType.ALL_FILES, - 1))) { - testHarness.open(); - OperatorTestBase.trigger(testHarness); - icebergSourceSplits = testHarness.extractOutputValues(); - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - } - - try (OneInputStreamOperatorTestHarness testHarness = - 
ProcessFunctionTestHarnesses.forProcessFunction( - new FileNameReader( - OperatorTestBase.DUMMY_TASK_NAME, - 0, - tableLoader(), - FILE_PATH_SCHEMA, - FILE_PATH_SCAN_CONTEXT, - MetadataTableType.ALL_FILES))) { - testHarness.open(); - for (MetadataTablePlanner.SplitInfo icebergSourceSplit : icebergSourceSplits) { - testHarness.processElement(icebergSourceSplit, System.currentTimeMillis()); - } - - assertThat(testHarness.extractOutputValues()).hasSize(4); - assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java deleted file mode 100644 index 51d901e923c7..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTaskResultAggregator.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.concurrent.ConcurrentLinkedQueue; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.util.TwoInputStreamOperatorTestHarness; -import org.apache.iceberg.flink.maintenance.api.TaskResult; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.junit.jupiter.api.Test; - -class TestTaskResultAggregator extends OperatorTestBase { - - @Test - void testPassWatermark() throws Exception { - TaskResultAggregator taskResultAggregator = - new TaskResultAggregator("table-name", "task-name", 0); - try (TwoInputStreamOperatorTestHarness testHarness = - new TwoInputStreamOperatorTestHarness<>(taskResultAggregator)) { - testHarness.open(); - testHarness.processBothWatermarks(WATERMARK); - ConcurrentLinkedQueue output = testHarness.getOutput(); - assertThat(output).containsOnlyOnce(WATERMARK); - } - } - - @Test - void testProcessWatermarkWithoutElement() throws Exception { - TaskResultAggregator taskResultAggregator = - new TaskResultAggregator("table-name", "task-name", 0); - try (TwoInputStreamOperatorTestHarness testHarness = - new TwoInputStreamOperatorTestHarness<>(taskResultAggregator)) { - testHarness.open(); - testHarness.processBothWatermarks(WATERMARK); - List taskResults = testHarness.extractOutputValues(); - assertThat(taskResults).hasSize(0); - } - } - - @Test - void testProcessWatermark() throws Exception { - TaskResultAggregator taskResultAggregator = - new TaskResultAggregator("table-name", "task-name", 0); - try (TwoInputStreamOperatorTestHarness testHarness = - new TwoInputStreamOperatorTestHarness<>(taskResultAggregator)) { - testHarness.open(); - - testHarness.processElement1(new StreamRecord<>(Trigger.create(EVENT_TIME, 0))); - testHarness.processBothWatermarks(WATERMARK); - List taskResults = testHarness.extractOutputValues(); - assertThat(taskResults).hasSize(1); - TaskResult taskResult = taskResults.get(0); - assertThat(taskResult.taskIndex()).isEqualTo(0); - assertThat(taskResult.startEpoch()).isEqualTo(EVENT_TIME); - assertThat(taskResult.success()).isEqualTo(true); - assertThat(taskResult.exceptions()).hasSize(0); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java deleted file mode 100644 index 63bea00f346e..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestTriggerManager.java +++ /dev/null @@ -1,671 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.maintenance.operator; - -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.CONCURRENT_RUN_THROTTLED; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.NOTHING_TO_TRIGGER; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.RATE_LIMITER_TRIGGERED; -import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.TRIGGERED; -import static org.assertj.core.api.Assertions.assertThat; - -import java.time.Duration; -import java.util.List; -import java.util.stream.Stream; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.common.typeinfo.Types; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.operators.KeyedProcessOperator; -import org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.maintenance.api.Trigger; -import org.apache.iceberg.flink.maintenance.api.TriggerLockFactory; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.MethodSource; - -class TestTriggerManager extends OperatorTestBase { - private static final long DELAY = 10L; - private static final String[] TASKS = new String[] {"task0", "task1"}; - private long processingTime = 0L; - private TriggerLockFactory.Lock lock; - private TriggerLockFactory.Lock recoveringLock; - private String tableName; - - @BeforeEach - void before() { - super.before(); - Table table = createTable(); - this.lock = LOCK_FACTORY.createLock(); - this.recoveringLock = LOCK_FACTORY.createRecoveryLock(); - this.tableName = table.name(); - } - - @Test - void testCommitCount() throws Exception { - TriggerManager manager = - manager(tableLoader(), new TriggerEvaluator.Builder().commitCount(3).build()); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - - addEventAndCheckResult(testHarness, TableChange.builder().commitCount(1).build(), 0); - addEventAndCheckResult(testHarness, TableChange.builder().commitCount(2).build(), 1); - addEventAndCheckResult(testHarness, TableChange.builder().commitCount(3).build(), 2); - addEventAndCheckResult(testHarness, TableChange.builder().commitCount(10).build(), 3); - - // No trigger in this case - addEventAndCheckResult(testHarness, TableChange.builder().commitCount(1).build(), 3); - addEventAndCheckResult(testHarness, TableChange.builder().commitCount(1).build(), 3); - - addEventAndCheckResult(testHarness, TableChange.builder().commitCount(1).build(), 4); - } - } - - @Test - void testDataFileCount() throws Exception { - TriggerManager manager = - manager(tableLoader(), new TriggerEvaluator.Builder().dataFileCount(3).build()); - try (KeyedOneInputStreamOperatorTestHarness 
testHarness = - harness(manager)) { - testHarness.open(); - - addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(1).build(), 0); - - addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(2).build(), 1); - addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(3).build(), 2); - addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(5).build(), 3); - - // No trigger in this case - addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(1).build(), 3); - - addEventAndCheckResult(testHarness, TableChange.builder().dataFileCount(2).build(), 4); - } - } - - @Test - void testDataFileSizeInBytes() throws Exception { - TriggerManager manager = - manager(tableLoader(), new TriggerEvaluator.Builder().dataFileSizeInBytes(3).build()); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - - addEventAndCheckResult(testHarness, TableChange.builder().dataFileSizeInBytes(1L).build(), 0); - addEventAndCheckResult(testHarness, TableChange.builder().dataFileSizeInBytes(2L).build(), 1); - addEventAndCheckResult(testHarness, TableChange.builder().dataFileSizeInBytes(5L).build(), 2); - - // No trigger in this case - addEventAndCheckResult(testHarness, TableChange.builder().dataFileSizeInBytes(1L).build(), 2); - - addEventAndCheckResult(testHarness, TableChange.builder().dataFileSizeInBytes(2L).build(), 3); - } - } - - @Test - void testPosDeleteFileCount() throws Exception { - TriggerManager manager = - manager(tableLoader(), new TriggerEvaluator.Builder().posDeleteFileCount(3).build()); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - - addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(1).build(), 0); - addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(2).build(), 1); - addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(3).build(), 2); - addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(10).build(), 3); - - // No trigger in this case - addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(1).build(), 3); - addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(1).build(), 3); - - addEventAndCheckResult(testHarness, TableChange.builder().posDeleteFileCount(1).build(), 4); - } - } - - @Test - void testPosDeleteRecordCount() throws Exception { - TriggerManager manager = - manager(tableLoader(), new TriggerEvaluator.Builder().posDeleteRecordCount(3).build()); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - - addEventAndCheckResult( - testHarness, TableChange.builder().posDeleteRecordCount(1L).build(), 0); - addEventAndCheckResult( - testHarness, TableChange.builder().posDeleteRecordCount(2L).build(), 1); - addEventAndCheckResult( - testHarness, TableChange.builder().posDeleteRecordCount(5L).build(), 2); - - // No trigger in this case - addEventAndCheckResult( - testHarness, TableChange.builder().posDeleteRecordCount(1L).build(), 2); - - addEventAndCheckResult( - testHarness, TableChange.builder().posDeleteRecordCount(2L).build(), 3); - } - } - - @Test - void testEqDeleteFileCount() throws Exception { - TriggerManager manager = - manager(tableLoader(), new TriggerEvaluator.Builder().eqDeleteFileCount(3).build()); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - 
- addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(1).build(), 0); - addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(2).build(), 1); - addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(3).build(), 2); - addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(10).build(), 3); - - // No trigger in this case - addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(1).build(), 3); - addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(1).build(), 3); - - addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteFileCount(1).build(), 4); - } - } - - @Test - void testEqDeleteRecordCount() throws Exception { - TriggerManager manager = - manager(tableLoader(), new TriggerEvaluator.Builder().eqDeleteRecordCount(3).build()); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - - addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteRecordCount(1L).build(), 0); - addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteRecordCount(2L).build(), 1); - addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteRecordCount(5L).build(), 2); - - // No trigger in this case - addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteRecordCount(1L).build(), 2); - - addEventAndCheckResult(testHarness, TableChange.builder().eqDeleteRecordCount(2L).build(), 3); - } - } - - @Test - void testTimeout() throws Exception { - TriggerManager manager = - manager( - tableLoader(), new TriggerEvaluator.Builder().timeout(Duration.ofSeconds(1)).build()); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - - TableChange event = TableChange.builder().dataFileCount(1).commitCount(1).build(); - - // Wait for some time - testHarness.processElement(event, EVENT_TIME); - assertThat(testHarness.extractOutputValues()).isEmpty(); - - // Wait for the timeout to expire - long newTime = EVENT_TIME + Duration.ofSeconds(1).toMillis(); - testHarness.setProcessingTime(newTime); - testHarness.processElement(event, newTime); - assertThat(testHarness.extractOutputValues()).hasSize(1); - - // Remove the lock to allow the next trigger - lock.unlock(); - - // Send a new event - testHarness.setProcessingTime(newTime + 1); - testHarness.processElement(event, newTime); - - // No trigger yet - assertThat(testHarness.extractOutputValues()).hasSize(1); - - // Send a new event - newTime += Duration.ofSeconds(1).toMillis(); - testHarness.setProcessingTime(newTime); - testHarness.processElement(event, newTime); - - // New trigger should arrive - assertThat(testHarness.extractOutputValues()).hasSize(2); - } - } - - @Test - void testStateRestore() throws Exception { - TableLoader tableLoader = tableLoader(); - TriggerManager manager = manager(tableLoader); - OperatorSubtaskState state; - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - - testHarness.processElement( - TableChange.builder().dataFileCount(1).commitCount(1).build(), EVENT_TIME); - - assertThat(testHarness.extractOutputValues()).isEmpty(); - - state = testHarness.snapshot(1, EVENT_TIME); - } - - // Restore the state, write some more data, create a checkpoint, check the data which is written - manager = manager(tableLoader); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.initializeState(state); - 
testHarness.open(); - - // Arrives the first real change which triggers the recovery process - testHarness.processElement(TableChange.builder().commitCount(1).build(), EVENT_TIME_2); - assertTriggers( - testHarness.extractOutputValues(), - Lists.newArrayList(Trigger.recovery(testHarness.getProcessingTime()))); - - // Remove the lock to allow the next trigger - recoveringLock.unlock(); - testHarness.setProcessingTime(EVENT_TIME_2); - // At this point the output contains the recovery trigger and the real trigger - assertThat(testHarness.extractOutputValues()).hasSize(2); - } - } - - @Test - void testNewJobReleasesExistingLock() throws Exception { - // Lock first to mock previous job orphaned lock - lock.tryLock(); - recoveringLock.tryLock(); - - TableLoader tableLoader = tableLoader(); - TriggerManager manager = manager(tableLoader); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - - // Check the new job weather remove the orphaned lock - assertThat(lock.isHeld()).isFalse(); - assertThat(recoveringLock.isHeld()).isFalse(); - } - } - - @Test - void testMinFireDelay() throws Exception { - TableLoader tableLoader = tableLoader(); - TriggerManager manager = manager(tableLoader, DELAY, 1); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - - addEventAndCheckResult(testHarness, TableChange.builder().commitCount(2).build(), 1); - long currentTime = testHarness.getProcessingTime(); - - // No new fire yet - addEventAndCheckResult(testHarness, TableChange.builder().commitCount(2).build(), 1); - - // Check that the trigger fired after the delay - testHarness.setProcessingTime(currentTime + DELAY); - assertThat(testHarness.extractOutputValues()).hasSize(2); - } - } - - @Test - void testLockCheckDelay() throws Exception { - TableLoader tableLoader = tableLoader(); - TriggerManager manager = manager(tableLoader, 1, DELAY); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - - addEventAndCheckResult(testHarness, TableChange.builder().commitCount(2).build(), 1); - - // Create a lock to prevent execution, and check that there is no result - assertThat(lock.tryLock()).isTrue(); - addEventAndCheckResult(testHarness, TableChange.builder().commitCount(2).build(), 1); - long currentTime = testHarness.getProcessingTime(); - - // Remove the lock, and still no trigger - lock.unlock(); - assertThat(testHarness.extractOutputValues()).hasSize(1); - - // Check that the trigger fired after the delay - testHarness.setProcessingTime(currentTime + DELAY); - assertThat(testHarness.extractOutputValues()).hasSize(2); - } - } - - /** - * Simulating recovery scenarios where there is a leftover table lock, and ongoing maintenance - * task. 
- * - * @param locked if a lock exists on the table on job recovery - * @param runningTask is running and continues to run after job recovery - */ - @ParameterizedTest - @MethodSource("parametersForTestRecovery") - void testRecovery(boolean locked, boolean runningTask) throws Exception { - TableLoader tableLoader = tableLoader(); - TriggerManager manager = manager(tableLoader); - OperatorSubtaskState state; - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.open(); - state = testHarness.snapshot(1, EVENT_TIME); - } - - if (locked) { - assertThat(lock.tryLock()).isTrue(); - } - - manager = manager(tableLoader); - List expected = Lists.newArrayListWithExpectedSize(3); - try (KeyedOneInputStreamOperatorTestHarness testHarness = - harness(manager)) { - testHarness.initializeState(state); - testHarness.open(); - - ++processingTime; - expected.add(Trigger.recovery(processingTime)); - testHarness.setProcessingTime(processingTime); - testHarness.processElement(TableChange.builder().commitCount(2).build(), processingTime); - assertTriggers(testHarness.extractOutputValues(), expected); - - // Nothing happens until the recovery is finished - ++processingTime; - testHarness.setProcessingTime(processingTime); - assertTriggers(testHarness.extractOutputValues(), expected); - - if (runningTask) { - // Simulate the action of the recovered maintenance task lock removal when it finishes - lock.unlock(); - } - - // Still no results as the recovery is ongoing - ++processingTime; - testHarness.setProcessingTime(processingTime); - testHarness.processElement(TableChange.builder().commitCount(2).build(), processingTime); - assertTriggers(testHarness.extractOutputValues(), expected); - - // Simulate the action of removing lock and recoveryLock by downstream lock cleaner when it - // received recovery trigger - lock.unlock(); - recoveringLock.unlock(); - - // Emit only a single trigger - ++processingTime; - testHarness.setProcessingTime(processingTime); - // Releasing lock will create a new snapshot, and we receive this in the trigger - expected.add(Trigger.create(processingTime, 0)); - assertTriggers(testHarness.extractOutputValues(), expected); - } - } - - @Test - void testTriggerMetrics() throws Exception { - TableLoader tableLoader = tableLoader(); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - ManualSource source = - new ManualSource<>(env, TypeInformation.of(TableChange.class)); - CollectingSink sink = new CollectingSink<>(); - - TriggerManager manager = - new TriggerManager( - tableLoader, - LOCK_FACTORY, - Lists.newArrayList(TASKS), - Lists.newArrayList( - new TriggerEvaluator.Builder().commitCount(2).build(), - new TriggerEvaluator.Builder().commitCount(4).build()), - 1L, - 1L); - source - .dataStream() - .keyBy(unused -> true) - .process(manager) - .name(DUMMY_TASK_NAME) - .forceNonParallel() - .sinkTo(sink); - - JobClient jobClient = null; - try { - jobClient = env.executeAsync(); - - // This one doesn't trigger - tests NOTHING_TO_TRIGGER - source.sendRecord(TableChange.builder().commitCount(1).build()); - - Awaitility.await() - .until( - () -> { - Long notingCounter = - MetricsReporterFactoryForTests.counter( - ImmutableList.of(DUMMY_TASK_NAME, tableName, NOTHING_TO_TRIGGER)); - return notingCounter != null && notingCounter.equals(1L); - }); - - // Trigger one of the tasks - tests TRIGGERED - source.sendRecord(TableChange.builder().commitCount(1).build()); - // Wait until we receive the trigger - 
assertThat(sink.poll(Duration.ofSeconds(5))).isNotNull(); - assertThat( - MetricsReporterFactoryForTests.counter( - ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[0], "0", TRIGGERED))) - .isEqualTo(1L); - lock.unlock(); - - // Trigger both of the tasks - tests TRIGGERED - source.sendRecord(TableChange.builder().commitCount(2).build()); - // Wait until we receive the trigger - assertThat(sink.poll(Duration.ofSeconds(5))).isNotNull(); - lock.unlock(); - assertThat(sink.poll(Duration.ofSeconds(5))).isNotNull(); - lock.unlock(); - assertThat( - MetricsReporterFactoryForTests.counter( - ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[0], "0", TRIGGERED))) - .isEqualTo(2L); - assertThat( - MetricsReporterFactoryForTests.counter( - ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[1], "1", TRIGGERED))) - .isEqualTo(1L); - - // Final check all the counters - MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, RATE_LIMITER_TRIGGERED), -1L) - .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, CONCURRENT_RUN_THROTTLED), -1L) - .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[0], "0", TRIGGERED), 2L) - .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[1], "1", TRIGGERED), 1L) - .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, NOTHING_TO_TRIGGER), 1L) - .build()); - } finally { - closeJobClient(jobClient); - } - } - - @Test - void testRateLimiterMetrics() throws Exception { - TableLoader tableLoader = tableLoader(); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - ManualSource source = - new ManualSource<>(env, TypeInformation.of(TableChange.class)); - CollectingSink sink = new CollectingSink<>(); - - // High delay, so only triggered once - TriggerManager manager = manager(tableLoader, 1_000_000L, 1L); - source - .dataStream() - .keyBy(unused -> true) - .process(manager) - .name(DUMMY_TASK_NAME) - .forceNonParallel() - .sinkTo(sink); - - JobClient jobClient = null; - try { - jobClient = env.executeAsync(); - - // Start the first trigger - source.sendRecord(TableChange.builder().commitCount(2).build()); - assertThat(sink.poll(Duration.ofSeconds(5))).isNotNull(); - - // Remove the lock to allow the next trigger - lock.unlock(); - - // The second trigger will be blocked - source.sendRecord(TableChange.builder().commitCount(2).build()); - Awaitility.await() - .until( - () -> - MetricsReporterFactoryForTests.counter( - ImmutableList.of(DUMMY_TASK_NAME, tableName, RATE_LIMITER_TRIGGERED)) - .equals(1L)); - - // Final check all the counters - assertCounters(1L, 0L); - } finally { - closeJobClient(jobClient); - } - } - - @Test - void testConcurrentRunMetrics() throws Exception { - TableLoader tableLoader = tableLoader(); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - ManualSource source = - new ManualSource<>(env, TypeInformation.of(TableChange.class)); - CollectingSink sink = new CollectingSink<>(); - - // High delay, so only triggered once - TriggerManager manager = manager(tableLoader, 1L, 1_000_000L); - source - .dataStream() - .keyBy(unused -> true) - .process(manager) - .name(DUMMY_TASK_NAME) - .forceNonParallel() - .sinkTo(sink); - - JobClient jobClient = null; - try { - jobClient = env.executeAsync(); - - // Start the first trigger - notice that we do not remove the lock after the trigger - source.sendRecord(TableChange.builder().commitCount(2).build()); - 
assertThat(sink.poll(Duration.ofSeconds(5))).isNotNull(); - - // The second trigger will be blocked by the lock - source.sendRecord(TableChange.builder().commitCount(2).build()); - Awaitility.await() - .until( - () -> - MetricsReporterFactoryForTests.counter( - ImmutableList.of(DUMMY_TASK_NAME, tableName, CONCURRENT_RUN_THROTTLED)) - .equals(1L)); - - // Final check all the counters - assertCounters(0L, 1L); - } finally { - closeJobClient(jobClient); - } - } - - private static Stream parametersForTestRecovery() { - return Stream.of( - Arguments.of(true, true), - Arguments.of(true, false), - Arguments.of(false, true), - Arguments.of(false, false)); - } - - private void assertCounters(long rateLimiterTrigger, long concurrentRunTrigger) { - MetricsReporterFactoryForTests.assertCounters( - new ImmutableMap.Builder, Long>() - .put( - ImmutableList.of(DUMMY_TASK_NAME, tableName, RATE_LIMITER_TRIGGERED), - rateLimiterTrigger) - .put( - ImmutableList.of(DUMMY_TASK_NAME, tableName, CONCURRENT_RUN_THROTTLED), - concurrentRunTrigger) - .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, TASKS[0], "0", TRIGGERED), 1L) - .put(ImmutableList.of(DUMMY_TASK_NAME, tableName, NOTHING_TO_TRIGGER), 0L) - .build()); - } - - private KeyedOneInputStreamOperatorTestHarness harness( - TriggerManager manager) throws Exception { - return new KeyedOneInputStreamOperatorTestHarness<>( - new KeyedProcessOperator<>(manager), value -> true, Types.BOOLEAN); - } - - private void addEventAndCheckResult( - OneInputStreamOperatorTestHarness testHarness, - TableChange event, - int expectedSize) - throws Exception { - ++processingTime; - testHarness.setProcessingTime(processingTime); - testHarness.processElement(event, processingTime); - assertThat(testHarness.extractOutputValues()).hasSize(expectedSize); - // Remove the lock to allow the next trigger - lock.unlock(); - } - - private TriggerManager manager(TableLoader tableLoader, TriggerEvaluator evaluator) { - return new TriggerManager( - tableLoader, - LOCK_FACTORY, - Lists.newArrayList(TASKS[0]), - Lists.newArrayList(evaluator), - 1, - 1); - } - - private TriggerManager manager( - TableLoader tableLoader, long minFireDelayMs, long lockCheckDelayMs) { - return new TriggerManager( - tableLoader, - LOCK_FACTORY, - Lists.newArrayList(TASKS[0]), - Lists.newArrayList(new TriggerEvaluator.Builder().commitCount(2).build()), - minFireDelayMs, - lockCheckDelayMs); - } - - private TriggerManager manager(TableLoader tableLoader) { - return manager(tableLoader, new TriggerEvaluator.Builder().commitCount(2).build()); - } - - private static void assertTriggers(List expected, List actual) { - assertThat(actual).hasSize(expected.size()); - for (int i = 0; i < expected.size(); ++i) { - Trigger expectedTrigger = expected.get(i); - Trigger actualTrigger = actual.get(i); - assertThat(actualTrigger.timestamp()).isEqualTo(expectedTrigger.timestamp()); - assertThat(actualTrigger.taskId()).isEqualTo(expectedTrigger.taskId()); - assertThat(actualTrigger.isRecovery()).isEqualTo(expectedTrigger.isRecovery()); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java deleted file mode 100644 index 1cf55bcdc817..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/SinkTestUtil.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Collection; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; -import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; -import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; -import org.apache.flink.streaming.runtime.streamrecord.StreamElement; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; - -class SinkTestUtil { - - private SinkTestUtil() {} - - @SuppressWarnings("unchecked") - static List transformsToStreamElement(Collection elements) { - return elements.stream() - .map( - element -> { - if (element instanceof StreamRecord) { - return new StreamRecord<>( - ((StreamRecord>) element).getValue()); - } - return (StreamElement) element; - }) - .collect(Collectors.toList()); - } - - static CommittableSummary extractAndAssertCommittableSummary(StreamElement element) { - final Object value = element.asRecord().getValue(); - assertThat(value).isInstanceOf(CommittableSummary.class); - return (CommittableSummary) value; - } - - static CommittableWithLineage extractAndAssertCommittableWithLineage( - StreamElement element) { - final Object value = element.asRecord().getValue(); - assertThat(value).isInstanceOf(CommittableWithLineage.class); - return (CommittableWithLineage) value; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java deleted file mode 100644 index 44eb907a17aa..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */
-package org.apache.iceberg.flink.sink;
-
-import static org.assertj.core.api.Assertions.assertThat;
-
-import org.apache.flink.table.data.RowData;
-import org.apache.iceberg.flink.AvroGenericRecordConverterBase;
-import org.apache.iceberg.flink.DataGenerator;
-
-public class TestAvroGenericRecordToRowDataMapper extends AvroGenericRecordConverterBase {
-  @Override
-  protected void testConverter(DataGenerator dataGenerator) throws Exception {
-    // Need to use avroSchema from DataGenerator because some primitive types have special Avro
-    // type handling. Hence the Avro schema converted from Iceberg schema won't work.
-    AvroGenericRecordToRowDataMapper mapper =
-        AvroGenericRecordToRowDataMapper.forAvroSchema(dataGenerator.avroSchema());
-    RowData expected = dataGenerator.generateFlinkRowData();
-    RowData actual = mapper.map(dataGenerator.generateAvroGenericRecord());
-    assertThat(actual).isEqualTo(expected);
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java
deleted file mode 100644
index abac605f81fd..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.sink;
-
-import static org.assertj.core.api.Assertions.assertThat;
-import static org.assertj.core.api.Assertions.assertThatExceptionOfType;
-
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.flink.SimpleDataUtil;
-import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.EnumSource;
-
-public class TestBucketPartitionKeySelector {
-
-  @ParameterizedTest
-  @EnumSource(
-      value = TableSchemaType.class,
-      names = {"ONE_BUCKET", "IDENTITY_AND_BUCKET"})
-  public void testCorrectKeySelection(TableSchemaType tableSchemaType) {
-    int numBuckets = 60;
-
-    PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets);
-    BucketPartitionKeySelector keySelector =
-        new BucketPartitionKeySelector(
-            partitionSpec, SimpleDataUtil.SCHEMA, SimpleDataUtil.ROW_TYPE);
-
-    TestBucketPartitionerUtil.generateRowsForBucketIdRange(2, numBuckets)
-        .forEach(
-            rowData -> {
-              int expectedBucketId =
-                  TestBucketPartitionerUtil.computeBucketId(
-                      numBuckets, rowData.getString(1).toString());
-              Integer key = keySelector.getKey(rowData);
-              assertThat(key).isEqualTo(expectedBucketId);
-            });
-  }
-
-  @Test
-  public void testKeySelectorMultipleBucketsFail() {
-    PartitionSpec partitionSpec = TableSchemaType.TWO_BUCKETS.getPartitionSpec(1);
-
-    assertThatExceptionOfType(RuntimeException.class)
-        .isThrownBy(
-            () ->
-                new BucketPartitionKeySelector(
-                    partitionSpec, SimpleDataUtil.SCHEMA, SimpleDataUtil.ROW_TYPE))
-        .withMessage(BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, 2);
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java
deleted file mode 100644
index 59bdba578ebb..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE; -import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE; -import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_NULL_MESSAGE; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatExceptionOfType; - -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.CsvSource; - -public class TestBucketPartitioner { - - static final int DEFAULT_NUM_BUCKETS = 60; - - @ParameterizedTest - @CsvSource({"ONE_BUCKET,50", "IDENTITY_AND_BUCKET,50", "ONE_BUCKET,60", "IDENTITY_AND_BUCKET,60"}) - public void testPartitioningParallelismGreaterThanBuckets( - String schemaTypeStr, String numBucketsStr) { - int numPartitions = 500; - TableSchemaType tableSchemaType = TableSchemaType.valueOf(schemaTypeStr); - int numBuckets = Integer.parseInt(numBucketsStr); - PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); - BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); - - int bucketId = 0; - for (int expectedIndex = 0; expectedIndex < numPartitions; expectedIndex++) { - int actualPartitionIndex = bucketPartitioner.partition(bucketId, numPartitions); - assertThat(actualPartitionIndex).isEqualTo(expectedIndex); - bucketId++; - if (bucketId == numBuckets) { - bucketId = 0; - } - } - } - - @ParameterizedTest - @CsvSource({"ONE_BUCKET,50", "IDENTITY_AND_BUCKET,50", "ONE_BUCKET,60", "IDENTITY_AND_BUCKET,60"}) - public void testPartitioningParallelismEqualLessThanBuckets( - String schemaTypeStr, String numBucketsStr) { - int numPartitions = 30; - TableSchemaType tableSchemaType = TableSchemaType.valueOf(schemaTypeStr); - int numBuckets = Integer.parseInt(numBucketsStr); - PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); - BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); - - for (int bucketId = 0; bucketId < numBuckets; bucketId++) { - int actualPartitionIndex = bucketPartitioner.partition(bucketId, numPartitions); - assertThat(actualPartitionIndex).isEqualTo(bucketId % numPartitions); - } - } - - @Test - public void testPartitionerBucketIdNullFail() { - PartitionSpec partitionSpec = TableSchemaType.ONE_BUCKET.getPartitionSpec(DEFAULT_NUM_BUCKETS); - BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); - - assertThatExceptionOfType(RuntimeException.class) - .isThrownBy(() -> bucketPartitioner.partition(null, DEFAULT_NUM_BUCKETS)) - .withMessage(BUCKET_NULL_MESSAGE); - } - - @Test - public void testPartitionerMultipleBucketsFail() { - PartitionSpec partitionSpec = TableSchemaType.TWO_BUCKETS.getPartitionSpec(DEFAULT_NUM_BUCKETS); - - assertThatExceptionOfType(RuntimeException.class) - .isThrownBy(() -> new BucketPartitioner(partitionSpec)) - .withMessage(BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, 2); - } - - @Test - public void testPartitionerBucketIdOutOfRangeFail() { - PartitionSpec partitionSpec = TableSchemaType.ONE_BUCKET.getPartitionSpec(DEFAULT_NUM_BUCKETS); - BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); - - int negativeBucketId = -1; - assertThatExceptionOfType(IllegalArgumentException.class) 
- .isThrownBy(() -> bucketPartitioner.partition(negativeBucketId, 1)) - .withMessage(BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE, negativeBucketId); - - int tooBigBucketId = DEFAULT_NUM_BUCKETS; - assertThatExceptionOfType(IllegalArgumentException.class) - .isThrownBy(() -> bucketPartitioner.partition(tooBigBucketId, 1)) - .withMessage(BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE, tooBigBucketId, DEFAULT_NUM_BUCKETS); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java deleted file mode 100644 index caf0ac6f21d8..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.apache.iceberg.flink.TestFixtures.TABLE_IDENTIFIER; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; -import org.apache.flink.table.types.DataType; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import 
org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -public class TestBucketPartitionerFlinkIcebergSink { - - private static final int NUMBER_TASK_MANAGERS = 1; - private static final int SLOTS_PER_TASK_MANAGER = 8; - - @RegisterExtension - private static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(NUMBER_TASK_MANAGERS) - .setNumberSlotsPerTaskManager(SLOTS_PER_TASK_MANAGER) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - private static final TypeInformation ROW_TYPE_INFO = - new RowTypeInfo( - SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().stream() - .map(ExternalTypeInfo::of) - .toArray(TypeInformation[]::new)); - - // Parallelism = 8 (parallelism > numBuckets) throughout the test suite - private final int parallelism = NUMBER_TASK_MANAGERS * SLOTS_PER_TASK_MANAGER; - private final FileFormat format = FileFormat.PARQUET; - private final int numBuckets = 4; - - private Table table; - private StreamExecutionEnvironment env; - private TableLoader tableLoader; - - private void setupEnvironment(TableSchemaType tableSchemaType) { - PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - partitionSpec, - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); - env = - StreamExecutionEnvironment.getExecutionEnvironment(DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism * 2); - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - private void appendRowsToTable(List allRows) throws Exception { - DataFormatConverters.RowConverter converter = - new DataFormatConverters.RowConverter( - SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().toArray(DataType[]::new)); - - DataStream dataStream = - env.addSource( - new BoundedTestSource<>( - allRows.stream().map(converter::toExternal).toArray(Row[]::new)), - ROW_TYPE_INFO) - .map(converter::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)) - .partitionCustom( - new BucketPartitioner(table.spec()), - new BucketPartitionKeySelector( - table.spec(), - table.schema(), - FlinkSink.toFlinkRowType(table.schema(), SimpleDataUtil.FLINK_SCHEMA))); - - FlinkSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.NONE) - .append(); - - env.execute("Test Iceberg DataStream"); - - SimpleDataUtil.assertTableRows(table, allRows); - } - - @ParameterizedTest - @EnumSource( - value = TableSchemaType.class, - names = {"ONE_BUCKET", "IDENTITY_AND_BUCKET"}) - public void testSendRecordsToAllBucketsEvenly(TableSchemaType tableSchemaType) throws Exception { - setupEnvironment(tableSchemaType); - List rows = generateTestDataRows(); - - appendRowsToTable(rows); - TableTestStats stats = extractPartitionResults(tableSchemaType); - - assertThat(stats.totalRowCount).isEqualTo(rows.size()); - // All 4 buckets should've been written to - 
assertThat(stats.writersPerBucket).hasSize(numBuckets); - assertThat(stats.numFilesPerBucket).hasSize(numBuckets); - // Writer expectation (2 writers per bucket): - // - Bucket0 -> Writers [0, 4] - // - Bucket1 -> Writers [1, 5] - // - Bucket2 -> Writers [2, 6] - // - Bucket3 -> Writers [3, 7] - for (int i = 0, j = numBuckets; i < numBuckets; i++, j++) { - assertThat(stats.writersPerBucket.get(i)).hasSameElementsAs(Arrays.asList(i, j)); - // 2 files per bucket (one file is created by each writer) - assertThat(stats.numFilesPerBucket.get(i)).isEqualTo(2); - // 2 rows per file (total of 16 rows across 8 files) - assertThat(stats.rowsPerWriter.get(i)).isEqualTo(2); - } - } - - /** - * Generating 16 rows to be sent uniformly to all writers (round-robin across 8 writers -> 4 - * buckets) - */ - private List generateTestDataRows() { - int totalNumRows = parallelism * 2; - int numRowsPerBucket = totalNumRows / numBuckets; - return TestBucketPartitionerUtil.generateRowsForBucketIdRange(numRowsPerBucket, numBuckets); - } - - private TableTestStats extractPartitionResults(TableSchemaType tableSchemaType) - throws IOException { - int totalRecordCount = 0; - Map> writersPerBucket = Maps.newHashMap(); // > - Map filesPerBucket = Maps.newHashMap(); // - Map rowsPerWriter = Maps.newHashMap(); // - - try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { - for (FileScanTask scanTask : fileScanTasks) { - long recordCountInFile = scanTask.file().recordCount(); - - String[] splitFilePath = scanTask.file().location().split("/"); - // Filename example: 00007-0-a7d3a29a-33e9-4740-88f4-0f494397d60c-00001.parquet - // Writer ID: .......^^^^^ - String filename = splitFilePath[splitFilePath.length - 1]; - int writerId = Integer.parseInt(filename.split("-")[0]); - - totalRecordCount += recordCountInFile; - int bucketId = - scanTask - .file() - .partition() - .get(tableSchemaType.bucketPartitionColumnPosition(), Integer.class); - writersPerBucket.computeIfAbsent(bucketId, k -> Lists.newArrayList()); - writersPerBucket.get(bucketId).add(writerId); - filesPerBucket.put(bucketId, filesPerBucket.getOrDefault(bucketId, 0) + 1); - rowsPerWriter.put(writerId, rowsPerWriter.getOrDefault(writerId, 0L) + recordCountInFile); - } - } - - return new TableTestStats(totalRecordCount, writersPerBucket, filesPerBucket, rowsPerWriter); - } - - /** DTO to hold Test Stats */ - private static class TableTestStats { - final int totalRowCount; - final Map> writersPerBucket; - final Map numFilesPerBucket; - final Map rowsPerWriter; - - TableTestStats( - int totalRecordCount, - Map> writersPerBucket, - Map numFilesPerBucket, - Map rowsPerWriter) { - this.totalRowCount = totalRecordCount; - this.writersPerBucket = writersPerBucket; - this.numFilesPerBucket = numFilesPerBucket; - this.rowsPerWriter = rowsPerWriter; - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java deleted file mode 100644 index e1309bfac6d5..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import java.util.UUID; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.BucketUtil; - -final class TestBucketPartitionerUtil { - - enum TableSchemaType { - ONE_BUCKET { - @Override - public int bucketPartitionColumnPosition() { - return 0; - } - - @Override - public PartitionSpec getPartitionSpec(int numBuckets) { - return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("data", numBuckets).build(); - } - }, - IDENTITY_AND_BUCKET { - @Override - public int bucketPartitionColumnPosition() { - return 1; - } - - @Override - public PartitionSpec getPartitionSpec(int numBuckets) { - return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA) - .identity("id") - .bucket("data", numBuckets) - .build(); - } - }, - TWO_BUCKETS { - @Override - public int bucketPartitionColumnPosition() { - return 1; - } - - @Override - public PartitionSpec getPartitionSpec(int numBuckets) { - return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA) - .bucket("id", numBuckets) - .bucket("data", numBuckets) - .build(); - } - }; - - public abstract int bucketPartitionColumnPosition(); - - public abstract PartitionSpec getPartitionSpec(int numBuckets); - } - - private TestBucketPartitionerUtil() {} - - /** - * Utility method to generate rows whose values will "hash" to a range of bucketIds (from 0 to - * numBuckets - 1) - * - * @param numRowsPerBucket how many different rows should be generated per bucket - * @param numBuckets max number of buckets to consider - * @return the list of rows whose data "hashes" to the desired bucketId - */ - static List generateRowsForBucketIdRange(int numRowsPerBucket, int numBuckets) { - List rows = Lists.newArrayListWithCapacity(numBuckets * numRowsPerBucket); - // For some of our tests, this order of the generated rows matters - for (int i = 0; i < numRowsPerBucket; i++) { - for (int bucketId = 0; bucketId < numBuckets; bucketId++) { - String value = generateValueForBucketId(bucketId, numBuckets); - rows.add(GenericRowData.of(1, StringData.fromString(value))); - } - } - return rows; - } - - /** - * Utility method to generate a UUID string that will "hash" to a desired bucketId - * - * @param bucketId the desired bucketId - * @return the string data that "hashes" to the desired bucketId - */ - private static String generateValueForBucketId(int bucketId, int numBuckets) { - while (true) { - String uuid = UUID.randomUUID().toString(); - if (computeBucketId(numBuckets, uuid) == bucketId) { - return uuid; - } - } - } - - /** - * Utility that performs the same hashing/bucketing mechanism used by Bucket.java - * - * @param numBuckets max number of buckets to 
consider - * @param value the string to compute the bucketId from - * @return the computed bucketId - */ - static int computeBucketId(int numBuckets, String value) { - return (BucketUtil.hash(value) & Integer.MAX_VALUE) % numBuckets; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java deleted file mode 100644 index 360db658cd2f..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -import java.time.Duration; -import java.util.concurrent.TimeUnit; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.Test; - -public class TestCachingTableSupplier { - - @Test - public void testCheckArguments() { - SerializableTable initialTable = mock(SerializableTable.class); - - Table loadedTable = mock(Table.class); - TableLoader tableLoader = mock(TableLoader.class); - when(tableLoader.loadTable()).thenReturn(loadedTable); - - new CachingTableSupplier(initialTable, tableLoader, Duration.ofMillis(100)); - - assertThatThrownBy(() -> new CachingTableSupplier(initialTable, tableLoader, null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("tableRefreshInterval cannot be null"); - assertThatThrownBy(() -> new CachingTableSupplier(null, tableLoader, Duration.ofMillis(100))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("initialTable cannot be null"); - assertThatThrownBy(() -> new CachingTableSupplier(initialTable, null, Duration.ofMillis(100))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("tableLoader cannot be null"); - } - - @Test - public void testTableReload() { - SerializableTable initialTable = mock(SerializableTable.class); - - Table loadedTable = mock(Table.class); - TableLoader tableLoader = mock(TableLoader.class); - when(tableLoader.loadTable()).thenReturn(loadedTable); - - CachingTableSupplier cachingTableSupplier = - new CachingTableSupplier(initialTable, tableLoader, Duration.ofMillis(100)); - - // refresh shouldn't do anything as the min reload interval hasn't passed - cachingTableSupplier.refreshTable(); - assertThat(cachingTableSupplier.get()).isEqualTo(initialTable); - - // refresh after waiting past the min reload 
interval - Awaitility.await() - .atLeast(100, TimeUnit.MILLISECONDS) - .untilAsserted( - () -> { - cachingTableSupplier.refreshTable(); - assertThat(cachingTableSupplier.get()).isEqualTo(loadedTable); - }); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java deleted file mode 100644 index 0c7a47c23230..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCommittableToTableChangeConverter.java +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; -import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.streaming.util.ProcessFunctionTestHarnesses; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileMetadata; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.maintenance.operator.TableChange; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -class TestCommittableToTableChangeConverter { - @TempDir private File tempDir; - private Table table; - private FileIO fileIO; - private String tableName; - private Map specs; - private DataFile dataFile; - private DataFile dataFile1; - private DeleteFile posDeleteFile; - private DeleteFile eqDeleteFile; - - @BeforeEach - public void before() throws Exception { - String warehouse = tempDir.getAbsolutePath(); - - String tablePath = warehouse.concat("/test"); - assertThat(new File(tablePath).mkdir()).as("Should create the table path correctly.").isTrue(); - table = SimpleDataUtil.createTable(tablePath, Maps.newHashMap(), false); - fileIO = table.io(); - tableName = table.name(); - specs = table.specs(); - dataFile = - 
DataFiles.builder(table.spec()) - .withPath("/path/to/data.parquet") - .withFileSizeInBytes(100) - .withRecordCount(10) - .build(); - dataFile1 = - DataFiles.builder(table.spec()) - .withPath("/path/to/data1.parquet") - .withFileSizeInBytes(101) - .withRecordCount(11) - .build(); - posDeleteFile = - FileMetadata.deleteFileBuilder(table.spec()) - .ofPositionDeletes() - .withPath("/path/to/pos-deletes.parquet") - .withFileSizeInBytes(50) - .withRecordCount(5) - .build(); - eqDeleteFile = - FileMetadata.deleteFileBuilder(table.spec()) - .ofEqualityDeletes(1) - .withPath("/path/to/eq-deletes.parquet") - .withFileSizeInBytes(30) - .withRecordCount(3) - .build(); - } - - @Test - public void testConvertWriteResultToTableChange() throws Exception { - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, 1, 1); - - try (OneInputStreamOperatorTestHarness, TableChange> - harness = - ProcessFunctionTestHarnesses.forProcessFunction( - new CommittableToTableChangeConverter(fileIO, tableName, specs))) { - harness.open(); - WriteResult writeResult = - WriteResult.builder() - .addDataFiles(dataFile) - .addDeleteFiles(posDeleteFile, eqDeleteFile) - .build(); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles(writeResult, () -> factory.create(1), table.spec()); - IcebergCommittable committable = - new IcebergCommittable( - SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, deltaManifests), - flinkJobId, - operatorId, - 1L); - CommittableWithLineage message = - new CommittableWithLineage<>(committable, 1L, 0); - harness.processElement(new StreamRecord<>(message)); - TableChange tableChange = harness.extractOutputValues().get(0); - TableChange expectedTableChange = - TableChange.builder() - .dataFileCount(1) - .dataFileSizeInBytes(100) - .posDeleteFileCount(1) - .posDeleteRecordCount(5) - .eqDeleteFileCount(1) - .eqDeleteRecordCount(3) - .commitCount(1) - .build(); - - assertThat(tableChange).isEqualTo(expectedTableChange); - } - } - - @Test - public void testConvertReplays() throws Exception { - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, 1, 1); - - try (OneInputStreamOperatorTestHarness, TableChange> - harness = - ProcessFunctionTestHarnesses.forProcessFunction( - new CommittableToTableChangeConverter(fileIO, tableName, specs))) { - harness.open(); - - Tuple2, DeltaManifests> icebergCommittable = - createIcebergCommittable( - dataFile, posDeleteFile, eqDeleteFile, factory, table, flinkJobId, operatorId, 1L); - harness.processElement(new StreamRecord<>(icebergCommittable.f0)); - // Duplicate data should be handled properly to avoid job failure. 
- harness.processElement(new StreamRecord<>(icebergCommittable.f0)); - List tableChanges = harness.extractOutputValues(); - assertThat(tableChanges).hasSize(1); - TableChange tableChange = tableChanges.get(0); - TableChange expectedTableChange = - TableChange.builder() - .dataFileCount(1) - .dataFileSizeInBytes(100) - .posDeleteFileCount(1) - .posDeleteRecordCount(5) - .eqDeleteFileCount(1) - .eqDeleteRecordCount(3) - .commitCount(1) - .build(); - - assertThat(tableChange).isEqualTo(expectedTableChange); - } - } - - @Test - public void testReadUnExistManifest() throws Exception { - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, 1, 1); - - try (OneInputStreamOperatorTestHarness, TableChange> - harness = - ProcessFunctionTestHarnesses.forProcessFunction( - new CommittableToTableChangeConverter(fileIO, tableName, specs))) { - harness.open(); - - Tuple2, DeltaManifests> icebergCommittable = - createIcebergCommittable( - dataFile, posDeleteFile, eqDeleteFile, factory, table, flinkJobId, operatorId, 1L); - - for (ManifestFile manifest : icebergCommittable.f1.manifests()) { - fileIO.deleteFile(manifest.path()); - // check Manifest files are deleted - assertThat(new File(manifest.path())).doesNotExist(); - } - - // Emit the same committable to check read no exist manifest - // should be handled properly to avoid job failure. - harness.processElement(new StreamRecord<>(icebergCommittable.f0)); - - Tuple2, DeltaManifests> icebergCommittable1 = - createIcebergCommittable( - dataFile1, posDeleteFile, eqDeleteFile, factory, table, flinkJobId, operatorId, 1L); - - harness.processElement(new StreamRecord<>(icebergCommittable1.f0)); - - List tableChanges = harness.extractOutputValues(); - assertThat(tableChanges).hasSize(1); - TableChange tableChange = tableChanges.get(0); - TableChange expectedTableChange = - TableChange.builder() - .dataFileCount(1) - .dataFileSizeInBytes(101) - .posDeleteFileCount(1) - .posDeleteRecordCount(5) - .eqDeleteFileCount(1) - .eqDeleteRecordCount(3) - .commitCount(1) - .build(); - - assertThat(tableChange).isEqualTo(expectedTableChange); - } - } - - @Test - public void testEmptyCommit() throws Exception { - try (OneInputStreamOperatorTestHarness, TableChange> - harness = - ProcessFunctionTestHarnesses.forProcessFunction( - new CommittableToTableChangeConverter(fileIO, tableName, specs))) { - - harness.open(); - IcebergCommittable emptyCommittable = - new IcebergCommittable(new byte[0], "jobId", "operatorId", 1L); - CommittableWithLineage message = - new CommittableWithLineage<>(emptyCommittable, 1L, 0); - harness.processElement(new StreamRecord<>(message)); - List tableChanges = harness.extractOutputValues(); - assertThat(tableChanges).hasSize(0); - } - } - - @Test - public void testManifestDeletion() throws Exception { - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, 1, 1); - - try (OneInputStreamOperatorTestHarness, TableChange> - harness = - ProcessFunctionTestHarnesses.forProcessFunction( - new CommittableToTableChangeConverter(fileIO, tableName, specs))) { - - harness.open(); - - Tuple2, DeltaManifests> icebergCommittable = - createIcebergCommittable( - dataFile, posDeleteFile, eqDeleteFile, factory, table, flinkJobId, 
operatorId, 1L); - - harness.processElement(new StreamRecord<>(icebergCommittable.f0)); - - // check Manifest files are deleted - for (ManifestFile manifest : icebergCommittable.f1.manifests()) { - assertThat(new File(manifest.path())).doesNotExist(); - } - } - } - - private static Tuple2, DeltaManifests> - createIcebergCommittable( - DataFile dataFile, - DeleteFile posDeleteFile, - DeleteFile eqDeleteFile, - ManifestOutputFileFactory factory, - Table table, - String flinkJobId, - String operatorId, - long checkpointId) - throws IOException { - WriteResult writeResult = - WriteResult.builder() - .addDataFiles(dataFile) - .addDeleteFiles(posDeleteFile, eqDeleteFile) - .build(); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - writeResult, () -> factory.create(checkpointId), table.spec()); - - IcebergCommittable committable = - new IcebergCommittable( - SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, deltaManifests), - flinkJobId, - operatorId, - checkpointId); - return Tuple2.of(new CommittableWithLineage<>(committable, checkpointId, 0), deltaManifests); - } - - private static String newFlinkJobId() { - return UUID.randomUUID().toString(); - } - - private static String newOperatorUniqueId() { - return UUID.randomUUID().toString(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java deleted file mode 100644 index 5a74db5713a5..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Map; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.common.DynFields; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.BaseTaskWriter; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestCompressionSettings { - @TempDir protected Path temporaryFolder; - - private Table table; - - @Parameter(index = 0) - private Map initProperties; - - @Parameters(name = "tableProperties = {0}") - public static Object[][] parameters() { - return new Object[][] { - new Object[] {ImmutableMap.of()}, - new Object[] { - ImmutableMap.of( - TableProperties.AVRO_COMPRESSION, - "zstd", - TableProperties.AVRO_COMPRESSION_LEVEL, - "3", - TableProperties.PARQUET_COMPRESSION, - "zstd", - TableProperties.PARQUET_COMPRESSION_LEVEL, - "3", - TableProperties.ORC_COMPRESSION, - "zstd", - TableProperties.ORC_COMPRESSION_STRATEGY, - "compression") - } - }; - } - - @BeforeEach - public void before() throws IOException { - File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); - table = SimpleDataUtil.createTable(folder.getAbsolutePath(), initProperties, false); - } - - @TestTemplate - public void testCompressionAvro() throws Exception { - // No override provided - Map resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "AVRO")); - - if (initProperties.get(TableProperties.AVRO_COMPRESSION) == null) { - assertThat(resultProperties) - .containsEntry(TableProperties.AVRO_COMPRESSION, TableProperties.AVRO_COMPRESSION_DEFAULT) - .doesNotContainKey(TableProperties.AVRO_COMPRESSION_LEVEL); - } else { - assertThat(resultProperties) - .containsEntry( - TableProperties.AVRO_COMPRESSION, - initProperties.get(TableProperties.AVRO_COMPRESSION)) - .containsEntry( - TableProperties.AVRO_COMPRESSION_LEVEL, - initProperties.get(TableProperties.AVRO_COMPRESSION_LEVEL)); - } - - // Override compression to snappy and some random level - resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of( - FlinkWriteOptions.WRITE_FORMAT.key(), - "AVRO", - FlinkWriteOptions.COMPRESSION_CODEC.key(), - "snappy", - FlinkWriteOptions.COMPRESSION_LEVEL.key(), - "6")); - - assertThat(resultProperties) - .containsEntry(TableProperties.AVRO_COMPRESSION, "snappy") - .containsEntry(TableProperties.AVRO_COMPRESSION_LEVEL, "6"); - } - - @TestTemplate - public void testCompressionParquet() throws Exception { - // No override provided - Map 
resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "PARQUET")); - - if (initProperties.get(TableProperties.PARQUET_COMPRESSION) == null) { - assertThat(resultProperties) - .containsEntry( - TableProperties.PARQUET_COMPRESSION, - TableProperties.PARQUET_COMPRESSION_DEFAULT_SINCE_1_4_0) - .doesNotContainKey(TableProperties.PARQUET_COMPRESSION_LEVEL); - } else { - assertThat(resultProperties) - .containsEntry( - TableProperties.PARQUET_COMPRESSION, - initProperties.get(TableProperties.PARQUET_COMPRESSION)) - .containsEntry( - TableProperties.PARQUET_COMPRESSION_LEVEL, - initProperties.get(TableProperties.PARQUET_COMPRESSION_LEVEL)); - } - - // Override compression to snappy and some random level - resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of( - FlinkWriteOptions.WRITE_FORMAT.key(), - "PARQUET", - FlinkWriteOptions.COMPRESSION_CODEC.key(), - "snappy", - FlinkWriteOptions.COMPRESSION_LEVEL.key(), - "6")); - - assertThat(resultProperties) - .containsEntry(TableProperties.PARQUET_COMPRESSION, "snappy") - .containsEntry(TableProperties.PARQUET_COMPRESSION_LEVEL, "6"); - } - - @TestTemplate - public void testCompressionOrc() throws Exception { - // No override provided - Map resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "ORC")); - - if (initProperties.get(TableProperties.ORC_COMPRESSION) == null) { - assertThat(resultProperties) - .containsEntry(TableProperties.ORC_COMPRESSION, TableProperties.ORC_COMPRESSION_DEFAULT) - .containsEntry( - TableProperties.ORC_COMPRESSION_STRATEGY, - TableProperties.ORC_COMPRESSION_STRATEGY_DEFAULT); - } else { - assertThat(resultProperties) - .containsEntry( - TableProperties.ORC_COMPRESSION, initProperties.get(TableProperties.ORC_COMPRESSION)) - .containsEntry( - TableProperties.ORC_COMPRESSION_STRATEGY, - initProperties.get(TableProperties.ORC_COMPRESSION_STRATEGY)); - } - - // Override compression to snappy and a different strategy - resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of( - FlinkWriteOptions.WRITE_FORMAT.key(), - "ORC", - FlinkWriteOptions.COMPRESSION_CODEC.key(), - "snappy", - FlinkWriteOptions.COMPRESSION_STRATEGY.key(), - "speed")); - - assertThat(resultProperties) - .containsEntry(TableProperties.ORC_COMPRESSION, "snappy") - .containsEntry(TableProperties.ORC_COMPRESSION_STRATEGY, "speed"); - } - - private static OneInputStreamOperatorTestHarness - createIcebergStreamWriter( - Table icebergTable, ResolvedSchema flinkSchema, Map override) - throws Exception { - RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); - FlinkWriteConf flinkWriteConfig = - new FlinkWriteConf( - icebergTable, override, new org.apache.flink.configuration.Configuration()); - - IcebergStreamWriter streamWriter = - FlinkSink.createStreamWriter(() -> icebergTable, flinkWriteConfig, flinkRowType, null); - OneInputStreamOperatorTestHarness harness = - new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); - - harness.setup(); - harness.open(); - - return harness; - } - - private static Map appenderProperties( - Table table, ResolvedSchema schema, Map override) throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter(table, schema, override)) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - - 
testHarness.prepareSnapshotPreBarrier(1L); - DynFields.BoundField operatorField = - DynFields.builder() - .hiddenImpl(testHarness.getOperatorFactory().getClass(), "operator") - .build(testHarness.getOperatorFactory()); - DynFields.BoundField writerField = - DynFields.builder() - .hiddenImpl(IcebergStreamWriter.class, "writer") - .build(operatorField.get()); - DynFields.BoundField appenderField = - DynFields.builder() - .hiddenImpl(BaseTaskWriter.class, "appenderFactory") - .build(writerField.get()); - DynFields.BoundField> propsField = - DynFields.builder() - .hiddenImpl(FlinkAppenderFactory.class, "props") - .build(appenderField.get()); - return propsField.get(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java deleted file mode 100644 index a21c51c378af..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java +++ /dev/null @@ -1,428 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.SimpleDataUtil.createDelete; -import static org.apache.iceberg.flink.SimpleDataUtil.createInsert; -import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateAfter; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateBefore; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.time.OffsetDateTime; -import java.util.Arrays; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.flink.table.types.logical.IntType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.FileContent; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.RowDelta; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TestBase; -import org.apache.iceberg.TestTables; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.StructLikeSet; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestDeltaTaskWriter extends TestBase { - - @Parameter(index = 1) - private FileFormat format; - - @Parameters(name = "formatVersion = {0}, fileFormat = {1}") - protected static List parameters() { - return Arrays.asList( - new Object[] {2, FileFormat.AVRO}, - new Object[] {2, FileFormat.ORC}, - new Object[] {2, FileFormat.PARQUET}); - } - - @Override - @BeforeEach - public void setupTable() throws IOException { - this.metadataDir = new File(tableDir, "metadata"); - } - - private int idFieldId() { - return table.schema().findField("id").fieldId(); - } - - private int dataFieldId() { - return table.schema().findField("data").fieldId(); - } - - private void testCdcEvents(boolean partitioned) throws IOException { - Set equalityFieldIds = Sets.newHashSet(idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - // Start the 1th 
transaction. - TaskWriter writer = taskWriterFactory.create(); - - writer.write(createInsert(1, "aaa")); - writer.write(createInsert(2, "bbb")); - writer.write(createInsert(3, "ccc")); - - // Update <2, 'bbb'> to <2, 'ddd'> - writer.write(createUpdateBefore(2, "bbb")); // 1 pos-delete and 1 eq-delete. - writer.write(createUpdateAfter(2, "ddd")); - - // Update <1, 'aaa'> to <1, 'eee'> - writer.write(createUpdateBefore(1, "aaa")); // 1 pos-delete and 1 eq-delete. - writer.write(createUpdateAfter(1, "eee")); - - // Insert <4, 'fff'> - writer.write(createInsert(4, "fff")); - // Insert <5, 'ggg'> - writer.write(createInsert(5, "ggg")); - - // Delete <3, 'ccc'> - writer.write(createDelete(3, "ccc")); // 1 pos-delete and 1 eq-delete. - - WriteResult result = writer.complete(); - assertThat(result.dataFiles()).hasSize(partitioned ? 7 : 1); - assertThat(result.deleteFiles()).hasSize(partitioned ? 3 : 1); - commitTransaction(result); - - assertThat(actualRowSet("*")) - .isEqualTo( - expectedRowSet( - createRecord(1, "eee"), - createRecord(2, "ddd"), - createRecord(4, "fff"), - createRecord(5, "ggg"))); - - // Start the 2nd transaction. - writer = taskWriterFactory.create(); - - // Update <2, 'ddd'> to <6, 'hhh'> - (Update both key and value) - writer.write(createUpdateBefore(2, "ddd")); // 1 eq-delete - writer.write(createUpdateAfter(6, "hhh")); - - // Update <5, 'ggg'> to <5, 'iii'> - writer.write(createUpdateBefore(5, "ggg")); // 1 eq-delete - writer.write(createUpdateAfter(5, "iii")); - - // Delete <4, 'fff'> - writer.write(createDelete(4, "fff")); // 1 eq-delete. - - result = writer.complete(); - assertThat(result.dataFiles()).hasSize(partitioned ? 2 : 1); - assertThat(result.deleteFiles()).hasSize(partitioned ? 3 : 1); - commitTransaction(result); - - assertThat(actualRowSet("*")) - .isEqualTo( - expectedRowSet(createRecord(1, "eee"), createRecord(5, "iii"), createRecord(6, "hhh"))); - } - - @TestTemplate - public void testUnpartitioned() throws IOException { - createAndInitTable(false); - testCdcEvents(false); - } - - @TestTemplate - public void testPartitioned() throws IOException { - createAndInitTable(true); - testCdcEvents(true); - } - - private void testWritePureEqDeletes(boolean partitioned) throws IOException { - createAndInitTable(partitioned); - Set equalityFieldIds = Sets.newHashSet(idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - TaskWriter writer = taskWriterFactory.create(); - writer.write(createDelete(1, "aaa")); - writer.write(createDelete(2, "bbb")); - writer.write(createDelete(3, "ccc")); - - WriteResult result = writer.complete(); - assertThat(result.dataFiles()).isEmpty(); - assertThat(result.deleteFiles()).hasSize(partitioned ? 
3 : 1); - commitTransaction(result); - - assertThat(actualRowSet("*")).isEqualTo(expectedRowSet()); - } - - @TestTemplate - public void testUnpartitionedPureEqDeletes() throws IOException { - testWritePureEqDeletes(false); - } - - @TestTemplate - public void testPartitionedPureEqDeletes() throws IOException { - testWritePureEqDeletes(true); - } - - private void testAbort(boolean partitioned) throws IOException { - createAndInitTable(partitioned); - Set equalityFieldIds = Sets.newHashSet(idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - TaskWriter writer = taskWriterFactory.create(); - for (int i = 0; i < 8_000; i += 2) { - writer.write(createUpdateBefore(i + 1, "aaa")); - writer.write(createUpdateAfter(i + 1, "aaa")); - - writer.write(createUpdateBefore(i + 2, "bbb")); - writer.write(createUpdateAfter(i + 2, "bbb")); - } - - // Assert the current data/delete file count. - List files = - Files.walk(Paths.get(tableDir.getPath(), "data")) - .filter(p -> p.toFile().isFile()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - assertThat(files).hasSize(partitioned ? 4 : 2); - - writer.abort(); - for (Path file : files) { - assertThat(file).doesNotExist(); - } - } - - @TestTemplate - public void testUnpartitionedAbort() throws IOException { - testAbort(false); - } - - @TestTemplate - public void testPartitionedAbort() throws IOException { - testAbort(true); - } - - @TestTemplate - public void testPartitionedTableWithDataAsKey() throws IOException { - createAndInitTable(true); - Set equalityFieldIds = Sets.newHashSet(dataFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - // Start the 1th transaction. - TaskWriter writer = taskWriterFactory.create(); - writer.write(createInsert(1, "aaa")); - writer.write(createInsert(2, "aaa")); - writer.write(createInsert(3, "bbb")); - writer.write(createInsert(4, "ccc")); - - WriteResult result = writer.complete(); - assertThat(result.dataFiles()).hasSize(3); - assertThat(result.deleteFiles()).hasSize(1); - commitTransaction(result); - - assertThat(actualRowSet("*")) - .isEqualTo( - expectedRowSet(createRecord(2, "aaa"), createRecord(3, "bbb"), createRecord(4, "ccc"))); - - // Start the 2nd transaction. - writer = taskWriterFactory.create(); - writer.write(createInsert(5, "aaa")); - writer.write(createInsert(6, "bbb")); - writer.write(createDelete(7, "ccc")); // 1 eq-delete. - - result = writer.complete(); - assertThat(result.dataFiles()).hasSize(2); - assertThat(result.deleteFiles()).hasSize(1); - commitTransaction(result); - - assertThat(actualRowSet("*")) - .isEqualTo( - expectedRowSet( - createRecord(2, "aaa"), - createRecord(5, "aaa"), - createRecord(3, "bbb"), - createRecord(6, "bbb"))); - } - - @TestTemplate - public void testPartitionedTableWithDataAndIdAsKey() throws IOException { - createAndInitTable(true); - Set equalityFieldIds = Sets.newHashSet(dataFieldId(), idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - TaskWriter writer = taskWriterFactory.create(); - writer.write(createInsert(1, "aaa")); - writer.write(createInsert(2, "aaa")); - - writer.write(createDelete(2, "aaa")); // 1 pos-delete. 
- - WriteResult result = writer.complete(); - assertThat(result.dataFiles()).hasSize(1); - assertThat(result.deleteFiles()).hasSize(1); - assertThat(result.deleteFiles()[0].content()).isEqualTo(FileContent.POSITION_DELETES); - commitTransaction(result); - - assertThat(actualRowSet("*")).isEqualTo(expectedRowSet(createRecord(1, "aaa"))); - } - - @TestTemplate - public void testEqualityColumnOnCustomPrecisionTSColumn() throws IOException { - Schema tableSchema = - new Schema( - required(3, "id", Types.IntegerType.get()), - required(4, "ts", Types.TimestampType.withZone())); - RowType flinkType = - new RowType( - false, - ImmutableList.of( - new RowType.RowField("id", new IntType()), - new RowType.RowField("ts", new LocalZonedTimestampType(3)))); - - this.table = create(tableSchema, PartitionSpec.unpartitioned()); - initTable(table); - - Set equalityIds = ImmutableSet.of(table.schema().findField("ts").fieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(flinkType, equalityIds); - taskWriterFactory.initialize(1, 1); - - TaskWriter writer = taskWriterFactory.create(); - RowDataSerializer serializer = new RowDataSerializer(flinkType); - OffsetDateTime start = OffsetDateTime.now(); - writer.write( - serializer.toBinaryRow( - GenericRowData.ofKind( - RowKind.INSERT, 1, TimestampData.fromInstant(start.toInstant())))); - writer.write( - serializer.toBinaryRow( - GenericRowData.ofKind( - RowKind.INSERT, 2, TimestampData.fromInstant(start.plusSeconds(1).toInstant())))); - writer.write( - serializer.toBinaryRow( - GenericRowData.ofKind( - RowKind.DELETE, 2, TimestampData.fromInstant(start.plusSeconds(1).toInstant())))); - - WriteResult result = writer.complete(); - // One data file - assertThat(result.dataFiles()).hasSize(1); - // One eq delete file + one pos delete file - assertThat(result.deleteFiles()).hasSize(2); - assertThat( - Arrays.stream(result.deleteFiles()) - .map(ContentFile::content) - .collect(Collectors.toSet())) - .isEqualTo(Sets.newHashSet(FileContent.POSITION_DELETES, FileContent.EQUALITY_DELETES)); - commitTransaction(result); - - Record expectedRecord = GenericRecord.create(tableSchema); - expectedRecord.setField("id", 1); - int cutPrecisionNano = start.getNano() / 1000000 * 1000000; - expectedRecord.setField("ts", start.withNano(cutPrecisionNano)); - - assertThat(actualRowSet("*")).isEqualTo(expectedRowSet(expectedRecord)); - } - - private void commitTransaction(WriteResult result) { - RowDelta rowDelta = table.newRowDelta(); - Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); - Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - rowDelta - .validateDeletedFiles() - .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles())) - .commit(); - } - - private StructLikeSet expectedRowSet(Record... records) { - return SimpleDataUtil.expectedRowSet(table, records); - } - - private StructLikeSet actualRowSet(String... 
columns) throws IOException {
-    return SimpleDataUtil.actualRowSet(table, columns);
-  }
-
-  private TaskWriterFactory<RowData> createTaskWriterFactory(Set<Integer> equalityFieldIds) {
-    return new RowDataTaskWriterFactory(
-        SerializableTable.copyOf(table),
-        FlinkSchemaUtil.convert(table.schema()),
-        128 * 1024 * 1024,
-        format,
-        table.properties(),
-        equalityFieldIds,
-        false);
-  }
-
-  private TaskWriterFactory<RowData> createTaskWriterFactory(
-      RowType flinkType, Set<Integer> equalityFieldIds) {
-    return new RowDataTaskWriterFactory(
-        SerializableTable.copyOf(table),
-        flinkType,
-        128 * 1024 * 1024,
-        format,
-        table.properties(),
-        equalityFieldIds,
-        true);
-  }
-
-  private void createAndInitTable(boolean partitioned) {
-    if (partitioned) {
-      this.table = create(SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("data").build());
-    } else {
-      this.table = create(SCHEMA, PartitionSpec.unpartitioned());
-    }
-
-    initTable(table);
-  }
-
-  private void initTable(TestTables.TestTable testTable) {
-    testTable
-        .updateProperties()
-        .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(8 * 1024))
-        .defaultFormat(format)
-        .commit();
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java
deleted file mode 100644
index dd89f43483b0..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.sink;
-
-import java.util.List;
-import org.apache.flink.table.data.RowData;
-import org.apache.flink.table.types.logical.RowType;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.flink.FlinkSchemaUtil;
-import org.apache.iceberg.flink.RowDataWrapper;
-import org.apache.iceberg.flink.SimpleDataUtil;
-import org.apache.iceberg.io.FileAppenderFactory;
-import org.apache.iceberg.io.TestAppenderFactory;
-import org.apache.iceberg.util.ArrayUtil;
-import org.apache.iceberg.util.StructLikeSet;
-
-public class TestFlinkAppenderFactory extends TestAppenderFactory<RowData> {
-
-  private final RowType rowType = FlinkSchemaUtil.convert(SCHEMA);
-
-  @Override
-  protected FileAppenderFactory<RowData> createAppenderFactory(
-      List<Integer> equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) {
-    return new FlinkAppenderFactory(
-        table,
-        table.schema(),
-        rowType,
-        table.properties(),
-        table.spec(),
-        ArrayUtil.toIntArray(equalityFieldIds),
-        eqDeleteSchema,
-        posDeleteRowSchema);
-  }
-
-  @Override
-  protected RowData createRow(Integer id, String data) {
-    return SimpleDataUtil.createRowData(id, data);
-  }
-
-  @Override
-  protected StructLikeSet expectedRowSet(Iterable<RowData> rows) {
-    StructLikeSet set = StructLikeSet.create(table.schema().asStruct());
-    for (RowData row : rows) {
-      RowDataWrapper wrapper = new RowDataWrapper(rowType, table.schema().asStruct());
-      set.add(wrapper.wrap(row));
-    }
-    return set;
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java
deleted file mode 100644
index 414ee40d1357..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.sink;
-
-import java.util.List;
-import org.apache.flink.table.data.RowData;
-import org.apache.flink.table.types.logical.RowType;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.flink.FlinkSchemaUtil;
-import org.apache.iceberg.flink.RowDataWrapper;
-import org.apache.iceberg.flink.SimpleDataUtil;
-import org.apache.iceberg.io.FileWriterFactory;
-import org.apache.iceberg.io.TestFileWriterFactory;
-import org.apache.iceberg.util.ArrayUtil;
-import org.apache.iceberg.util.StructLikeSet;
-
-public class TestFlinkFileWriterFactory extends TestFileWriterFactory<RowData> {
-
-  @Override
-  protected FileWriterFactory<RowData> newWriterFactory(
-      Schema dataSchema,
-      List<Integer> equalityFieldIds,
-      Schema equalityDeleteRowSchema,
-      Schema positionDeleteRowSchema) {
-    return FlinkFileWriterFactory.builderFor(table)
-        .dataSchema(table.schema())
-        .dataFileFormat(format())
-        .deleteFileFormat(format())
-        .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds))
-        .equalityDeleteRowSchema(equalityDeleteRowSchema)
-        .positionDeleteRowSchema(positionDeleteRowSchema)
-        .build();
-  }
-
-  @Override
-  protected RowData toRow(Integer id, String data) {
-    return SimpleDataUtil.createRowData(id, data);
-  }
-
-  @Override
-  protected StructLikeSet toSet(Iterable<RowData> rows) {
-    StructLikeSet set = StructLikeSet.create(table.schema().asStruct());
-    RowType flinkType = FlinkSchemaUtil.convert(table.schema());
-    for (RowData row : rows) {
-      RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct());
-      set.add(wrapper.wrap(row));
-    }
-    return set;
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java
deleted file mode 100644
index fc1236ed8855..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestFlinkIcebergSink extends TestFlinkIcebergSinkBase { - @Parameter(index = 0) - private FileFormat format; - - @Parameter(index = 1) - private int parallelism; - - @Parameter(index = 2) - private boolean partitioned; - - @Parameter(index = 3) - private boolean isTableSchema; - - @Parameters(name = "format={0}, parallelism = {1}, partitioned = {2}, isTableSchema = {3}") - public static Object[][] parameters() { - return new Object[][] { - // Remove after the deprecation of TableSchema - BEGIN - {FileFormat.AVRO, 1, true, true}, - {FileFormat.AVRO, 1, false, true}, - {FileFormat.AVRO, 2, true, true}, - {FileFormat.AVRO, 2, false, true}, - {FileFormat.ORC, 1, true, true}, - {FileFormat.ORC, 1, false, true}, - {FileFormat.ORC, 2, true, true}, - {FileFormat.ORC, 2, false, true}, - {FileFormat.PARQUET, 1, true, true}, - {FileFormat.PARQUET, 1, false, true}, - {FileFormat.PARQUET, 2, true, true}, - {FileFormat.PARQUET, 2, false, true}, - // Remove after the deprecation of TableSchema - END - - {FileFormat.AVRO, 1, true, false}, - {FileFormat.AVRO, 1, false, false}, - {FileFormat.AVRO, 2, true, false}, - {FileFormat.AVRO, 2, false, false}, - {FileFormat.ORC, 1, true, false}, - {FileFormat.ORC, 1, false, false}, - {FileFormat.ORC, 2, true, false}, - {FileFormat.ORC, 2, false, false}, - {FileFormat.PARQUET, 1, true, false}, - {FileFormat.PARQUET, 1, false, false}, - {FileFormat.PARQUET, 2, true, false}, - {FileFormat.PARQUET, 2, false, false}, - }; - } - - @BeforeEach - public void before() throws IOException { - this.table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - partitioned - ? 
PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); - - this.env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - - this.tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testWriteRowData() throws Exception { - List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); - - FlinkSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream"); - - // Assert the iceberg table's records. - SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); - } - - @TestTemplate - public void testWriteRow() throws Exception { - testWriteRow(parallelism, null, DistributionMode.NONE, isTableSchema); - } - - @TestTemplate - public void testWriteRowWithFlinkSchema() throws Exception { - testWriteRow(parallelism, SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE, isTableSchema); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java deleted file mode 100644 index fe37aa9e56eb..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; -import org.apache.flink.table.types.DataType; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.extension.RegisterExtension; - -public class TestFlinkIcebergSinkBase { - - @RegisterExtension - public static MiniClusterExtension miniClusterResource = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @RegisterExtension - protected static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - protected static final TypeInformation ROW_TYPE_INFO = - new RowTypeInfo( - SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().stream() - .map(ExternalTypeInfo::of) - .toArray(TypeInformation[]::new)); - - protected static final DataFormatConverters.RowConverter CONVERTER = - new DataFormatConverters.RowConverter( - SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().toArray(DataType[]::new)); - - protected TableLoader tableLoader; - protected Table table; - protected StreamExecutionEnvironment env; - - protected BoundedTestSource createBoundedSource(List rows) { - return new BoundedTestSource<>(Collections.singletonList(rows)); - } - - protected List createRows(String prefix) { - return Lists.newArrayList( - Row.of(1, prefix + "aaa"), - Row.of(1, prefix + "bbb"), - Row.of(1, prefix + "ccc"), - Row.of(2, prefix + "aaa"), - Row.of(2, prefix + "bbb"), - Row.of(2, prefix + "ccc"), - Row.of(3, prefix + "aaa"), - Row.of(3, prefix + "bbb"), - Row.of(3, prefix + "ccc")); - } - - protected List convertToRowData(List rows) { - return rows.stream().map(CONVERTER::toInternal).collect(Collectors.toList()); - } - - protected void testWriteRow( - int writerParallelism, - ResolvedSchema resolvedSchema, - DistributionMode distributionMode, - boolean isTableSchema) - throws Exception { - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - if (isTableSchema) { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .tableSchema( - resolvedSchema != null ? 
TableSchema.fromResolvedSchema(resolvedSchema) : null) - .writeParallelism(writerParallelism) - .distributionMode(distributionMode) - .append(); - } else { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .resolvedSchema(resolvedSchema) - .writeParallelism(writerParallelism) - .distributionMode(distributionMode) - .append(); - } - - // Execute the program. - env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); - } - - protected int partitionFiles(String partition) throws IOException { - return SimpleDataUtil.partitionDataFiles(table, ImmutableMap.of("data", partition)).size(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java deleted file mode 100644 index fefd88cf57fe..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.types.Row; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestFlinkIcebergSinkBranch extends TestFlinkIcebergSinkBase { - @RegisterExtension - public static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - @Parameter(index = 0) - private String formatVersion; - - @Parameter(index = 1) - private String branch; - - @Parameter(index = 2) - private boolean isTableSchema; - - private TableLoader tableLoader; - - @Parameters(name = "formatVersion = {0}, branch = {1}, isTableSchema = {2}") - public static Object[][] parameters() { - return new Object[][] { - // Remove after the deprecation of TableSchema - BEGIN - {"1", "main", true}, - {"1", "testBranch", true}, - {"2", "main", true}, - {"2", "testBranch", true}, - // Remove after the deprecation of TableSchema - END - - {"1", "main", false}, - {"1", "testBranch", false}, - {"2", "main", false}, - {"2", "testBranch", false}, - }; - } - - @BeforeEach - public void before() throws IOException { - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of( - TableProperties.DEFAULT_FILE_FORMAT, - FileFormat.AVRO.name(), - TableProperties.FORMAT_VERSION, - formatVersion)); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testWriteRowWithFlinkSchema() throws Exception { - testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); - verifyOtherBranchUnmodified(); - } - - private void testWriteRow(ResolvedSchema resolvedSchema, DistributionMode distributionMode) - throws Exception { - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - if (isTableSchema) { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - 
.tableSchema(TableSchema.fromResolvedSchema(resolvedSchema)) - .toBranch(branch) - .distributionMode(distributionMode) - .append(); - } else { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .resolvedSchema(resolvedSchema) - .toBranch(branch) - .distributionMode(distributionMode) - .append(); - } - - // Execute the program. - env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(table, convertToRowData(rows), branch); - SimpleDataUtil.assertTableRows( - table, - ImmutableList.of(), - branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH); - - verifyOtherBranchUnmodified(); - } - - private void verifyOtherBranchUnmodified() { - String otherBranch = - branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH; - if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { - assertThat(table.currentSnapshot()).isNull(); - } - - assertThat(table.snapshot(otherBranch)).isNull(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java deleted file mode 100644 index 04bc5da6a9be..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java +++ /dev/null @@ -1,602 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.sink.shuffle.StatisticsType; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Conversions; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; - -/** - * This tests the distribution mode of Flink sink. Extract them separately since it is unnecessary - * to test different file formats (Avro, Orc, Parquet) like in {@link TestFlinkIcebergSink}. - * Removing the file format dimension reduces the number of combinations from 12 to 4, which helps - * reduce test run time. 
- */ -@ExtendWith(ParameterizedTestExtension.class) -public class TestFlinkIcebergSinkDistributionMode extends TestFlinkIcebergSinkBase { - - @RegisterExtension - public static MiniClusterExtension miniClusterResource = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); - - private final FileFormat format = FileFormat.PARQUET; - - @Parameter(index = 0) - private int parallelism; - - @Parameter(index = 1) - private boolean partitioned; - - @Parameter(index = 2) - private int writeParallelism; - - @Parameter(index = 3) - private boolean isTableSchema; - - @Parameters( - name = "parallelism = {0}, partitioned = {1}, writeParallelism = {2}, isTableSchema = {3}") - public static Object[][] parameters() { - return new Object[][] { - // Remove after the deprecation of TableSchema - BEGIN - {1, true, 1, true}, - {1, false, 1, true}, - {2, true, 2, true}, - {2, false, 2, true}, - {1, true, 2, true}, - {1, false, 2, true}, - // Remove after the deprecation of TableSchema - END - - {1, true, 1, false}, - {1, false, 1, false}, - {2, true, 2, false}, - {2, false, 2, false}, - {1, true, 2, false}, - {1, false, 2, false}, - }; - } - - @BeforeEach - public void before() throws IOException { - this.table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - partitioned - ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); - - this.env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(Math.max(parallelism, writeParallelism)); - - this.tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testShuffleByPartitionWithSchema() throws Exception { - testWriteRow(parallelism, SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH, isTableSchema); - if (partitioned) { - assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testJobNoneDistributeMode() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - testWriteRow(parallelism, null, DistributionMode.NONE, isTableSchema); - - if (parallelism > 1) { - if (partitioned) { - int files = partitionFiles("aaa") + partitionFiles("bbb") + partitionFiles("ccc"); - assertThat(files).isGreaterThan(3); - } - } - } - - @TestTemplate - public void testJobNullDistributionMode() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - testWriteRow(parallelism, null, null, isTableSchema); - - if (partitioned) { - assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testPartitionWriteMode() throws Exception { - testWriteRow(parallelism, null, DistributionMode.HASH, isTableSchema); - if (partitioned) { - assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - 
assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testOverrideWriteConfigWithUnknownDistributionMode() { - Map newProps = Maps.newHashMap(); - newProps.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), "UNRECOGNIZED"); - - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.Builder builder = - isTableSchema - ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .setAll(newProps) - : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .setAll(newProps); - - assertThatThrownBy(builder::append) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid distribution mode: UNRECOGNIZED"); - } - - @TestTemplate - public void testRangeDistributionWithoutSortOrderUnpartitioned() { - assumeThat(partitioned).isFalse(); - - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - - int numOfCheckpoints = 6; - DataStream dataStream = - env.addSource( - createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), - ROW_TYPE_INFO); - FlinkSink.Builder builder = - isTableSchema - ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism); - - // Range distribution requires either sort order or partition spec defined - assertThatThrownBy(builder::append) - .isInstanceOf(IllegalStateException.class) - .hasMessage( - "Invalid write distribution mode: range. Need to define sort order or partition spec."); - } - - @TestTemplate - public void testRangeDistributionWithoutSortOrderPartitioned() throws Exception { - assumeThat(partitioned).isTrue(); - - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - - int numOfCheckpoints = 6; - DataStream dataStream = - env.addSource( - createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), - ROW_TYPE_INFO); - FlinkSink.Builder builder = - isTableSchema - ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism); - - // sort based on partition columns - builder.append(); - env.execute(getClass().getSimpleName()); - - table.refresh(); - // ordered in reverse timeline from the newest snapshot to the oldest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Sometimes we will have more checkpoints than the bounded source if we pass the - // auto checkpoint interval. Thus producing multiple snapshots. 
- assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); - } - - @TestTemplate - public void testRangeDistributionWithNullValue() throws Exception { - assumeThat(partitioned).isTrue(); - - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - - int numOfCheckpoints = 6; - List> charRows = createCharRows(numOfCheckpoints, 10); - charRows.add(ImmutableList.of(Row.of(1, null))); - DataStream dataStream = - env.addSource(createRangeDistributionBoundedSource(charRows), ROW_TYPE_INFO); - FlinkSink.Builder builder = - isTableSchema - ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism); - - // sort based on partition columns - builder.append(); - env.execute(getClass().getSimpleName()); - - table.refresh(); - // ordered in reverse timeline from the newest snapshot to the oldest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Sometimes we will have more checkpoints than the bounded source if we pass the - // auto checkpoint interval. Thus producing multiple snapshots. - assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); - } - - @TestTemplate - public void testRangeDistributionWithSortOrder() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - table.replaceSortOrder().asc("data").commit(); - - int numOfCheckpoints = 6; - DataStream dataStream = - env.addSource( - createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), - ROW_TYPE_INFO); - if (isTableSchema) { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Map) - .append(); - } else { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Map) - .append(); - } - - env.execute(getClass().getSimpleName()); - - table.refresh(); - // ordered in reverse timeline from the newest snapshot to the oldest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Sometimes we will have more checkpoints than the bounded source if we pass the - // auto checkpoint interval. Thus producing multiple snapshots. - assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); - - // It takes 2 checkpoint cycle for statistics collection and application - // of the globally aggregated statistics in the range partitioner. 
- // The last two checkpoints should have range shuffle applied - List rangePartitionedCycles = - snapshots.subList(snapshots.size() - 2, snapshots.size()); - - if (partitioned) { - for (Snapshot snapshot : rangePartitionedCycles) { - List addedDataFiles = - Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); - // up to 26 partitions - assertThat(addedDataFiles).hasSizeLessThanOrEqualTo(26); - } - } else { - for (Snapshot snapshot : rangePartitionedCycles) { - List addedDataFiles = - Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); - // each writer task should only write one file for non-partition sort column - assertThat(addedDataFiles).hasSize(writeParallelism); - // verify there is no overlap in min-max stats range - if (parallelism > 1) { - assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); - } - } - } - } - - @TestTemplate - public void testRangeDistributionSketchWithSortOrder() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - table.replaceSortOrder().asc("id").commit(); - - int numOfCheckpoints = 6; - DataStream dataStream = - env.addSource( - createRangeDistributionBoundedSource(createIntRows(numOfCheckpoints, 1_000)), - ROW_TYPE_INFO); - if (isTableSchema) { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Sketch) - .append(); - } else { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Sketch) - .append(); - } - - env.execute(getClass().getSimpleName()); - - table.refresh(); - // ordered in reverse timeline from the newest snapshot to the oldest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Sometimes we will have more checkpoints than the bounded source if we pass the - // auto checkpoint interval. Thus producing multiple snapshots. - assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); - - // It takes 2 checkpoint cycle for statistics collection and application - // of the globally aggregated statistics in the range partitioner. - // The last two checkpoints should have range shuffle applied - List rangePartitionedCycles = - snapshots.subList(snapshots.size() - 2, snapshots.size()); - - // since the input has a single value for the data column, - // it is always the same partition. 
Hence there is no difference - // for partitioned or not - for (Snapshot snapshot : rangePartitionedCycles) { - List addedDataFiles = - Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); - // each writer task should only write one file for non-partition sort column - assertThat(addedDataFiles).hasSize(writeParallelism); - // verify there is no overlap in min-max stats range - if (writeParallelism > 2) { - assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); - } - } - } - - /** Test migration from Map stats to Sketch stats */ - @TestTemplate - public void testRangeDistributionStatisticsMigration() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - table.replaceSortOrder().asc("id").commit(); - - int numOfCheckpoints = 6; - List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); - for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { - // checkpointId 2 would emit 11_000 records which is larger than - // the OPERATOR_SKETCH_SWITCH_THRESHOLD of 10_000. - // This should trigger the stats migration. - int maxId = checkpointId < 2 ? 1_000 : 11_000; - List rows = Lists.newArrayListWithCapacity(maxId); - for (int j = 0; j < maxId; ++j) { - // fixed value "a" for the data (possible partition column) - rows.add(Row.of(j, "a")); - } - - rowsPerCheckpoint.add(rows); - } - - DataStream dataStream = - env.addSource(createRangeDistributionBoundedSource(rowsPerCheckpoint), ROW_TYPE_INFO); - if (isTableSchema) { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Auto) - .append(); - } else { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Auto) - .append(); - } - - env.execute(getClass().getSimpleName()); - - table.refresh(); - // ordered in reverse timeline from the newest snapshot to the oldest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Sometimes we will have more checkpoints than the bounded source if we pass the - // auto checkpoint interval. Thus producing multiple snapshots. - assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); - - // It takes 2 checkpoint cycle for statistics collection and application - // of the globally aggregated statistics in the range partitioner. - // The last two checkpoints should have range shuffle applied - List rangePartitionedCycles = - snapshots.subList(snapshots.size() - 2, snapshots.size()); - - // since the input has a single value for the data column, - // it is always the same partition. 
Hence there is no difference - // for partitioned or not - for (Snapshot snapshot : rangePartitionedCycles) { - List addedDataFiles = - Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); - // each writer task should only write one file for non-partition sort column - // sometimes - assertThat(addedDataFiles).hasSize(writeParallelism); - // verify there is no overlap in min-max stats range - if (writeParallelism > 1) { - assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); - } - } - } - - private BoundedTestSource createRangeDistributionBoundedSource( - List> rowsPerCheckpoint) { - return new BoundedTestSource<>(rowsPerCheckpoint); - } - - private List> createCharRows(int numOfCheckpoints, int countPerChar) { - List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); - for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { - List rows = Lists.newArrayListWithCapacity(26 * countPerChar); - for (int j = 0; j < countPerChar; ++j) { - for (char c = 'a'; c <= 'z'; ++c) { - rows.add(Row.of(1, String.valueOf(c))); - } - } - - rowsPerCheckpoint.add(rows); - } - - return rowsPerCheckpoint; - } - - private List> createIntRows(int numOfCheckpoints, int maxId) { - List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); - for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { - List rows = Lists.newArrayListWithCapacity(maxId); - for (int j = 0; j < maxId; ++j) { - // fixed value "a" for the data (possible partition column) - rows.add(Row.of(j, "a")); - } - - rowsPerCheckpoint.add(rows); - } - - return rowsPerCheckpoint; - } - - private void assertIdColumnStatsNoRangeOverlap(DataFile file1, DataFile file2) { - // id column has fieldId 1 - int file1LowerBound = - Conversions.fromByteBuffer(Types.IntegerType.get(), file1.lowerBounds().get(1)); - int file1UpperBound = - Conversions.fromByteBuffer(Types.IntegerType.get(), file1.upperBounds().get(1)); - int file2LowerBound = - Conversions.fromByteBuffer(Types.IntegerType.get(), file2.lowerBounds().get(1)); - int file2UpperBound = - Conversions.fromByteBuffer(Types.IntegerType.get(), file2.upperBounds().get(1)); - - if (file1LowerBound < file2LowerBound) { - assertThat(file1UpperBound).isLessThanOrEqualTo(file2LowerBound); - } else { - assertThat(file2UpperBound).isLessThanOrEqualTo(file1LowerBound); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java deleted file mode 100644 index 018b877a0115..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -/** - * This class tests the more extended features of Flink sink. Extract them separately since it is - * unnecessary to test all the parameters combinations in {@link TestFlinkIcebergSink}. Each test - * method in {@link TestFlinkIcebergSink} runs 12 combinations, which are expensive and slow. - */ -@ExtendWith(ParameterizedTestExtension.class) -public class TestFlinkIcebergSinkExtended extends TestFlinkIcebergSinkBase { - private final boolean partitioned = true; - private final int parallelism = 2; - private final FileFormat format = FileFormat.PARQUET; - - @Parameter private boolean isTableSchema; - - @Parameters(name = "isTableSchema={0}") - private static Object[][] parameters() { - return new Object[][] {{true}, {false}}; - } - - @BeforeEach - public void before() throws IOException { - this.table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - partitioned - ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); - - this.env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - - this.tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testTwoSinksInDisjointedDAG() throws Exception { - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - - Table leftTable = - CATALOG_EXTENSION - .catalog() - .createTable( - TableIdentifier.of("left"), - SimpleDataUtil.SCHEMA, - partitioned - ? 
PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - props); - TableLoader leftTableLoader = - TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("left")); - - Table rightTable = - CATALOG_EXTENSION - .catalog() - .createTable( - TableIdentifier.of("right"), - SimpleDataUtil.SCHEMA, - partitioned - ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - props); - TableLoader rightTableLoader = - TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("right")); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - env.getConfig().disableAutoGeneratedUIDs(); - - List leftRows = createRows("left-"); - DataStream leftStream = - env.addSource(createBoundedSource(leftRows), ROW_TYPE_INFO) - .name("leftCustomSource") - .uid("leftCustomSource"); - if (isTableSchema) { - FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(leftTable) - .tableLoader(leftTableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .distributionMode(DistributionMode.NONE) - .uidPrefix("leftIcebergSink") - .append(); - } else { - FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) - .table(leftTable) - .tableLoader(leftTableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .distributionMode(DistributionMode.NONE) - .uidPrefix("leftIcebergSink") - .append(); - } - - List rightRows = createRows("right-"); - DataStream rightStream = - env.addSource(createBoundedSource(rightRows), ROW_TYPE_INFO) - .name("rightCustomSource") - .uid("rightCustomSource"); - if (isTableSchema) { - FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(rightTable) - .tableLoader(rightTableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.HASH) - .uidPrefix("rightIcebergSink") - .setSnapshotProperty("flink.test", TestFlinkIcebergSink.class.getName()) - .setSnapshotProperties(Collections.singletonMap("direction", "rightTable")) - .append(); - } else { - FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) - .table(rightTable) - .tableLoader(rightTableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.HASH) - .uidPrefix("rightIcebergSink") - .setSnapshotProperty("flink.test", TestFlinkIcebergSink.class.getName()) - .setSnapshotProperties(Collections.singletonMap("direction", "rightTable")) - .append(); - } - - // Execute the program. 
- env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(leftTable, convertToRowData(leftRows)); - SimpleDataUtil.assertTableRows(rightTable, convertToRowData(rightRows)); - - leftTable.refresh(); - assertThat(leftTable.currentSnapshot().summary()).doesNotContainKeys("flink.test", "direction"); - rightTable.refresh(); - assertThat(rightTable.currentSnapshot().summary()) - .containsEntry("flink.test", TestFlinkIcebergSink.class.getName()) - .containsEntry("direction", "rightTable"); - } - - @TestTemplate - public void testOverrideWriteConfigWithUnknownFileFormat() { - Map newProps = Maps.newHashMap(); - newProps.put(FlinkWriteOptions.WRITE_FORMAT.key(), "UNRECOGNIZED"); - - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.Builder builder = - isTableSchema - ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps) - : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - assertThatThrownBy(builder::append) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid file format: UNRECOGNIZED"); - } - - @Test - public void testWriteRowWithTableRefreshInterval() throws Exception { - List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); - - Configuration flinkConf = new Configuration(); - flinkConf.setString(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key(), "100ms"); - - FlinkSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .flinkConf(flinkConf) - .writeParallelism(parallelism) - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream"); - - // Assert the iceberg table's records. - SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java deleted file mode 100644 index a5799288b5e3..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkRangeDistributionBucketing.java +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.expressions.Expressions.bucket; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.List; -import java.util.UUID; -import java.util.stream.Collectors; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.connector.datagen.source.DataGeneratorSource; -import org.apache.flink.connector.datagen.source.GeneratorFunction; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.extension.RegisterExtension; - -/** - * Test range distribution with bucketing partition column. Compared to hash distribution, range - * distribution is more general to handle bucketing column while achieving even distribution of - * traffic to writer tasks. - * - *
- * <ul>
- *   <li>keyBy on low cardinality (e.g.
- *       60) may not achieve balanced data distribution.
- *   <li>number of buckets (e.g. 60) is not divisible by the writer parallelism (e.g. 40).
- *   <li>number of buckets (e.g. 60) is smaller than the writer parallelism (e.g. 120).
- * </ul>
    - */ -@Timeout(value = 30) -@Disabled // https://github.com/apache/iceberg/pull/11305#issuecomment-2415207097 -public class TestFlinkIcebergSinkRangeDistributionBucketing { - private static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = - new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. - .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - // max supported parallelism is 16 (= 4 x 4) - @RegisterExtension - public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(4) - .setNumberSlotsPerTaskManager(4) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); - - private static final int NUM_BUCKETS = 4; - private static final int NUM_OF_CHECKPOINTS = 6; - private static final int ROW_COUNT_PER_CHECKPOINT = 200; - private static final Schema SCHEMA = - new Schema( - Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(2, "uuid", Types.UUIDType.get()), - Types.NestedField.optional(3, "data", Types.StringType.get())); - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).hour("ts").bucket("uuid", NUM_BUCKETS).build(); - private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); - - private TableLoader tableLoader; - private Table table; - - @BeforeEach - public void before() throws IOException { - this.tableLoader = CATALOG_EXTENSION.tableLoader(); - this.table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SCHEMA, - SPEC, - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.PARQUET.name())); - - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - - // Assuming ts is on ingestion/processing time. Writer only writes to 1 or 2 hours concurrently. - // Only sort on the bucket column to avoid each writer task writes to 60 buckets/files - // concurrently. 
- table.replaceSortOrder().asc(bucket("uuid", NUM_BUCKETS)).commit(); - } - - @AfterEach - public void after() throws Exception { - CATALOG_EXTENSION.catalog().dropTable(TestFixtures.TABLE_IDENTIFIER); - } - - /** number of buckets 4 matches writer parallelism of 4 */ - @Test - public void testBucketNumberEqualsToWriterParallelism() throws Exception { - testParallelism(4); - } - - /** number of buckets 4 is less than writer parallelism of 6 */ - @Test - public void testBucketNumberLessThanWriterParallelismNotDivisible() throws Exception { - testParallelism(6); - } - - /** number of buckets 4 is less than writer parallelism of 8 */ - @Test - public void testBucketNumberLessThanWriterParallelismDivisible() throws Exception { - testParallelism(8); - } - - /** number of buckets 4 is greater than writer parallelism of 3 */ - @Test - public void testBucketNumberHigherThanWriterParallelismNotDivisible() throws Exception { - testParallelism(3); - } - - /** number of buckets 4 is greater than writer parallelism of 2 */ - @Test - public void testBucketNumberHigherThanWriterParallelismDivisible() throws Exception { - testParallelism(2); - } - - private void testParallelism(int parallelism) throws Exception { - try (StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism)) { - - DataGeneratorSource generatorSource = - new DataGeneratorSource<>( - new RowGenerator(), - ROW_COUNT_PER_CHECKPOINT * NUM_OF_CHECKPOINTS, - RateLimiterStrategy.perCheckpoint(ROW_COUNT_PER_CHECKPOINT), - FlinkCompatibilityUtil.toTypeInfo(ROW_TYPE)); - DataStream dataStream = - env.fromSource(generatorSource, WatermarkStrategy.noWatermarks(), "Data Generator"); - - FlinkSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .append(); - env.execute(getClass().getSimpleName()); - - table.refresh(); - // ordered in reverse timeline from the oldest snapshot to the newest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Source rate limit per checkpoint cycle may not be super precise. - // There could be more checkpoint cycles and commits than planned. - assertThat(snapshots).hasSizeGreaterThanOrEqualTo(NUM_OF_CHECKPOINTS); - - // It takes 2 checkpoint cycle for statistics collection and application - // of the globally aggregated statistics in the range partitioner. - // The last two checkpoints should have range shuffle applied - List rangePartitionedCycles = - snapshots.subList(snapshots.size() - 2, snapshots.size()); - - for (Snapshot snapshot : rangePartitionedCycles) { - List addedDataFiles = - Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); - assertThat(addedDataFiles) - .hasSizeLessThanOrEqualTo(maxAddedDataFilesPerCheckpoint(parallelism)); - } - } - } - - /** - * Traffic is not perfectly balanced across all buckets in the small sample size Range - * distribution of the bucket id may cross subtask boundary. Hence the number of committed data - * files per checkpoint maybe larger than writer parallelism or the number of buckets. But it - * should not be more than the sum of those two. 
Without range distribution, the number of data - * files per commit can be 4x of parallelism (as the number of buckets is 4). - */ - private int maxAddedDataFilesPerCheckpoint(int parallelism) { - return NUM_BUCKETS + parallelism; - } - - private static class RowGenerator implements GeneratorFunction { - // use constant timestamp so that all rows go to the same hourly partition - private final long ts = System.currentTimeMillis(); - - @Override - public RowData map(Long index) throws Exception { - // random uuid should result in relatively balanced distribution across buckets - UUID uuid = UUID.randomUUID(); - ByteBuffer uuidByteBuffer = ByteBuffer.allocate(16); - uuidByteBuffer.putLong(uuid.getMostSignificantBits()); - uuidByteBuffer.putLong(uuid.getLeastSignificantBits()); - return GenericRowData.of( - TimestampData.fromEpochMillis(ts), - uuidByteBuffer.array(), - StringData.fromString("row-" + index)); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java deleted file mode 100644 index ffd40b6cdc95..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.util.List; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; - -@ExtendWith(ParameterizedTestExtension.class) -@Timeout(value = 60) -public class TestFlinkIcebergSinkV2 extends TestFlinkIcebergSinkV2Base { - @RegisterExtension - public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - @BeforeEach - public void setupTable() { - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - partitioned - ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - ImmutableMap.of( - TableProperties.DEFAULT_FILE_FORMAT, - format.name(), - TableProperties.FORMAT_VERSION, - String.valueOf(FORMAT_V2))); - - table - .updateProperties() - .set(TableProperties.DEFAULT_FILE_FORMAT, format.name()) - .set(TableProperties.WRITE_DISTRIBUTION_MODE, writeDistributionMode) - .commit(); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100L) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testCheckAndGetEqualityFieldIds() { - table - .updateSchema() - .allowIncompatibleChanges() - .addRequiredColumn("type", Types.StringType.get()) - .setIdentifierFields("type") - .commit(); - - DataStream dataStream = - env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = - isTableSchema - ? 
FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA).table(table) - : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); - - // Use schema identifier field IDs as equality field id list by default - assertThat(builder.checkAndGetEqualityFieldIds()) - .containsExactlyInAnyOrderElementsOf(table.schema().identifierFieldIds()); - - // Use user-provided equality field column as equality field id list - builder.equalityFieldColumns(Lists.newArrayList("id")); - assertThat(builder.checkAndGetEqualityFieldIds()) - .containsExactlyInAnyOrder(table.schema().findField("id").fieldId()); - - builder.equalityFieldColumns(Lists.newArrayList("type")); - assertThat(builder.checkAndGetEqualityFieldIds()) - .containsExactlyInAnyOrder(table.schema().findField("type").fieldId()); - } - - @TestTemplate - public void testChangeLogOnIdKey() throws Exception { - testChangeLogOnIdKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertOnlyDeletesOnDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa")), - ImmutableList.of(row("-D", 1, "aaa"), row("-D", 2, "bbb"))); - - List> expectedRecords = - ImmutableList.of(ImmutableList.of(record(1, "aaa")), ImmutableList.of()); - - testChangeLogs( - ImmutableList.of("data"), - row -> row.getField(ROW_DATA_POS), - true, - elementsPerCheckpoint, - expectedRecords, - SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testChangeLogOnDataKey() throws Exception { - testChangeLogOnDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testChangeLogOnIdDataKey() throws Exception { - testChangeLogOnIdDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testChangeLogOnSameKey() throws Exception { - testChangeLogOnSameKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertModeCheck() { - DataStream dataStream = - env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = - isTableSchema - ? FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .writeParallelism(parallelism) - .upsert(true) - : FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .tableLoader(tableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .upsert(true); - - assertThatThrownBy( - () -> - builder - .equalityFieldColumns(ImmutableList.of("id", "data")) - .overwrite(true) - .append()) - .isInstanceOf(IllegalStateException.class) - .hasMessage( - "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); - - if (writeDistributionMode.equals(DistributionMode.RANGE.modeName()) && !partitioned) { - // validation error thrown from distributeDataStream - assertThatThrownBy( - () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()) - .isInstanceOf(IllegalStateException.class) - .hasMessage( - "Invalid write distribution mode: range. 
Need to define sort order or partition spec."); - } else { - // validation error thrown from appendWriter - assertThatThrownBy( - () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()) - .isInstanceOf(IllegalStateException.class) - .hasMessage( - "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); - } - } - - @TestTemplate - public void testUpsertOnIdKey() throws Exception { - testUpsertOnIdKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertOnDataKey() throws Exception { - testUpsertOnDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertOnIdDataKey() throws Exception { - testUpsertOnIdDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testDeleteStats() throws Exception { - assumeThat(format).isNotEqualTo(FileFormat.AVRO); - - List> elementsPerCheckpoint = - ImmutableList.of( - // Checkpoint #1 - ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa"))); - - List> expectedRecords = ImmutableList.of(ImmutableList.of(record(1, "aaa"))); - - testChangeLogs( - ImmutableList.of("id", "data"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, - elementsPerCheckpoint, - expectedRecords, - "main"); - - DeleteFile deleteFile = table.currentSnapshot().addedDeleteFiles(table.io()).iterator().next(); - String fromStat = - new String( - deleteFile.lowerBounds().get(MetadataColumns.DELETE_FILE_PATH.fieldId()).array()); - DataFile dataFile = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - assumeThat(fromStat).isEqualTo(dataFile.location()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java deleted file mode 100644 index 12a4593d039e..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java +++ /dev/null @@ -1,424 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.IcebergGenerics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.StructLikeSet; - -class TestFlinkIcebergSinkV2Base { - - static final int FORMAT_V2 = 2; - static final TypeInformation ROW_TYPE_INFO = - new RowTypeInfo( - SimpleDataUtil.FLINK_SCHEMA.getColumnDataTypes().stream() - .map(ExternalTypeInfo::of) - .toArray(TypeInformation[]::new)); - - static final int ROW_ID_POS = 0; - static final int ROW_DATA_POS = 1; - - TableLoader tableLoader; - Table table; - StreamExecutionEnvironment env; - - @Parameter(index = 0) - FileFormat format; - - @Parameter(index = 1) - int parallelism = 1; - - @Parameter(index = 2) - boolean partitioned; - - @Parameter(index = 3) - String writeDistributionMode; - - @Parameter(index = 4) - boolean isTableSchema; - - @Parameters( - name = - "FileFormat={0}, Parallelism={1}, Partitioned={2}, WriteDistributionMode={3}, IsTableSchema={4}") - public static Object[][] parameters() { - return new Object[][] { - // Remove after the deprecation of TableSchema - BEGIN - {FileFormat.AVRO, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true}, - {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true}, - {FileFormat.AVRO, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true}, - {FileFormat.AVRO, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true}, - {FileFormat.ORC, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, true}, - {FileFormat.ORC, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, true}, - {FileFormat.ORC, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, true}, - {FileFormat.ORC, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, true}, - {FileFormat.PARQUET, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, true}, - {FileFormat.PARQUET, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, true}, - {FileFormat.PARQUET, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, true}, - {FileFormat.PARQUET, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, true}, - // Remove after the deprecation of TableSchema - END - - {FileFormat.AVRO, 1, true, 
TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false}, - {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false}, - {FileFormat.AVRO, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false}, - {FileFormat.AVRO, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false}, - {FileFormat.ORC, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, false}, - {FileFormat.ORC, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, false}, - {FileFormat.ORC, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, false}, - {FileFormat.ORC, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH, false}, - {FileFormat.PARQUET, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, false}, - {FileFormat.PARQUET, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, false}, - {FileFormat.PARQUET, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, false}, - {FileFormat.PARQUET, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE, false}, - }; - } - - static final Map ROW_KIND_MAP = - ImmutableMap.of( - "+I", RowKind.INSERT, - "-D", RowKind.DELETE, - "-U", RowKind.UPDATE_BEFORE, - "+U", RowKind.UPDATE_AFTER); - - Row row(String rowKind, int id, String data) { - RowKind kind = ROW_KIND_MAP.get(rowKind); - if (kind == null) { - throw new IllegalArgumentException("Unknown row kind: " + rowKind); - } - - return Row.ofKind(kind, id, data); - } - - void testUpsertOnIdDataKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 2, "bbb")), - ImmutableList.of(row("+I", 1, "aaa"), row("-D", 2, "bbb"), row("+I", 2, "ccc")), - ImmutableList.of(row("+U", 1, "bbb"), row("-U", 1, "ccc"), row("-D", 1, "aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "aaa"), record(2, "bbb")), - ImmutableList.of(record(1, "aaa"), record(2, "ccc")), - ImmutableList.of(record(1, "bbb"), record(2, "ccc"))); - testChangeLogs( - ImmutableList.of("id", "data"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - true, - elementsPerCheckpoint, - expectedRecords, - branch); - } - - void testChangeLogOnIdDataKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa")), - ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), - ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa"), record(2, "bbb")), - ImmutableList.of( - record(1, "aaa"), record(1, "bbb"), record(1, "ccc"), record(2, "bbb")), - ImmutableList.of( - record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "bbb"))); - - testChangeLogs( - ImmutableList.of("data", "id"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, - elementsPerCheckpoint, - expectedRecords, - branch); - } - - void testChangeLogOnSameKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - // Checkpoint #1 - ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa")), - // Checkpoint #2 - ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa")), - // Checkpoint #3 - ImmutableList.of(row("-D", 1, "aaa"), row("+I", 1, "aaa")), - // Checkpoint #4 - ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 1, 
"aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "aaa"))); - - testChangeLogs( - ImmutableList.of("id", "data"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, - elementsPerCheckpoint, - expectedRecords, - branch); - } - - void testChangeLogOnDataKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa")), - ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), - ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"), row("+I", 2, "ccc"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc")), - ImmutableList.of( - record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "ccc"))); - - testChangeLogs( - ImmutableList.of("data"), - row -> row.getField(ROW_DATA_POS), - false, - elementsPerCheckpoint, - expectedRecords, - branch); - } - - void testUpsertOnDataKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa"), row("+I", 2, "aaa"), row("+I", 3, "bbb")), - ImmutableList.of(row("+U", 4, "aaa"), row("-U", 3, "bbb"), row("+U", 5, "bbb")), - ImmutableList.of(row("+I", 6, "aaa"), row("+U", 7, "bbb"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(2, "aaa"), record(3, "bbb")), - ImmutableList.of(record(4, "aaa"), record(5, "bbb")), - ImmutableList.of(record(6, "aaa"), record(7, "bbb"))); - - testChangeLogs( - ImmutableList.of("data"), - row -> row.getField(ROW_DATA_POS), - true, - elementsPerCheckpoint, - expectedRecords, - branch); - } - - void testChangeLogOnIdKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa"), - row("-D", 2, "aaa"), - row("+I", 2, "bbb")), - ImmutableList.of( - row("-U", 2, "bbb"), row("+U", 2, "ccc"), row("-D", 2, "ccc"), row("+I", 2, "ddd")), - ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 1, "ccc"), - row("-D", 1, "ccc"), - row("+I", 1, "ddd"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "bbb")), - ImmutableList.of(record(1, "bbb"), record(2, "ddd")), - ImmutableList.of(record(1, "ddd"), record(2, "ddd"))); - - if (partitioned && writeDistributionMode.equals(TableProperties.WRITE_DISTRIBUTION_MODE_HASH)) { - assertThatThrownBy( - () -> - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - false, - elementsPerCheckpoint, - expectedRecords, - branch)) - .isInstanceOf(IllegalStateException.class) - .hasMessageStartingWith( - "In 'hash' distribution mode with equality fields set, source column") - .hasMessageContaining("should be included in equality fields:"); - - } else { - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - false, - elementsPerCheckpoint, - expectedRecords, - branch); - } - } - - void testUpsertOnIdKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "bbb")), - ImmutableList.of(row("+I", 1, 
"ccc")), - ImmutableList.of(row("+U", 1, "ddd"), row("+I", 1, "eee"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb")), - ImmutableList.of(record(1, "ccc")), - ImmutableList.of(record(1, "eee"))); - - if (!partitioned) { - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - true, - elementsPerCheckpoint, - expectedRecords, - branch); - } else { - assertThatThrownBy( - () -> - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - true, - elementsPerCheckpoint, - expectedRecords, - branch)) - .isInstanceOf(IllegalStateException.class) - .hasMessageContaining("should be included in equality fields:"); - } - } - - void testChangeLogs( - List equalityFieldColumns, - KeySelector keySelector, - boolean insertAsUpsert, - List> elementsPerCheckpoint, - List> expectedRecordsPerCheckpoint, - String branch) - throws Exception { - DataStream dataStream = - env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); - - if (isTableSchema) { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .writeParallelism(parallelism) - .equalityFieldColumns(equalityFieldColumns) - .upsert(insertAsUpsert) - .toBranch(branch) - .append(); - } else { - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .tableLoader(tableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .equalityFieldColumns(equalityFieldColumns) - .upsert(insertAsUpsert) - .toBranch(branch) - .append(); - } - - // Execute the program. - env.execute("Test Iceberg Change-Log DataStream."); - - table.refresh(); - List snapshots = findValidSnapshots(); - int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - assertThat(snapshots).hasSize(expectedSnapshotNum); - - for (int i = 0; i < expectedSnapshotNum; i++) { - long snapshotId = snapshots.get(i).snapshotId(); - List expectedRecords = expectedRecordsPerCheckpoint.get(i); - assertThat(actualRowSet(snapshotId, "*")) - .as("Should have the expected records for the checkpoint#" + i) - .isEqualTo(expectedRowSet(expectedRecords.toArray(new Record[0]))); - } - } - - Record record(int id, String data) { - return SimpleDataUtil.createRecord(id, data); - } - - List findValidSnapshots() { - List validSnapshots = Lists.newArrayList(); - for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream() - .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { - validSnapshots.add(snapshot); - } - } - return validSnapshots; - } - - StructLikeSet expectedRowSet(Record... records) { - return SimpleDataUtil.expectedRowSet(table, records); - } - - StructLikeSet actualRowSet(long snapshotId, String... 
columns) throws IOException { - table.refresh(); - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - try (CloseableIterable reader = - IcebergGenerics.read(table).useSnapshot(snapshotId).select(columns).build()) { - reader.forEach(set::add); - } - return set; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java deleted file mode 100644 index 8ce3e1886f40..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestFlinkIcebergSinkV2Branch extends TestFlinkIcebergSinkV2Base { - @RegisterExtension - static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - @Parameter(index = 5) - protected String branch; - - @Parameters( - name = - "FileFormat={0}, Parallelism={1}, Partitioned={2}, WriteDistributionMode={3}, IsTableSchema={4}, Branch={5}") - public static Object[][] parameters() { - return new Object[][] { - // Remove after the deprecation of TableSchema - BEGIN - {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true, "main"}, - {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, true, "testBranch"}, - // Remove after the deprecation of TableSchema - END - - {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false, "main"}, - { - FileFormat.AVRO, 1, false, 
TableProperties.WRITE_DISTRIBUTION_MODE_NONE, false, "testBranch" - }, - }; - } - - @BeforeEach - public void before() throws IOException { - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of( - TableProperties.DEFAULT_FILE_FORMAT, - FileFormat.AVRO.name(), - TableProperties.FORMAT_VERSION, - "2")); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testChangeLogOnIdKey() throws Exception { - testChangeLogOnIdKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testChangeLogOnDataKey() throws Exception { - testChangeLogOnDataKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testChangeLogOnIdDataKey() throws Exception { - testChangeLogOnIdDataKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testUpsertOnIdKey() throws Exception { - testUpsertOnIdKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testUpsertOnDataKey() throws Exception { - testUpsertOnDataKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testUpsertOnIdDataKey() throws Exception { - testUpsertOnIdDataKey(branch); - verifyOtherBranchUnmodified(); - } - - private void verifyOtherBranchUnmodified() { - String otherBranch = - branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH; - if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { - assertThat(table.currentSnapshot()).isNull(); - } - - assertThat(table.snapshot(otherBranch)).isNull(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java deleted file mode 100644 index 0feb4cc282d2..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2DistributionMode.java +++ /dev/null @@ -1,618 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.sink.shuffle.StatisticsType; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Conversions; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; - -/** - * This tests the distribution mode of the IcebergSink. Extract them separately since it is - * unnecessary to test different file formats (Avro, Orc, Parquet) like in {@link TestIcebergSink}. - * Removing the file format dimension reduces the number of combinations from 12 to 4, which helps - * reduce test run time. 
- */ -@ExtendWith(ParameterizedTestExtension.class) -public class TestFlinkIcebergSinkV2DistributionMode extends TestFlinkIcebergSinkBase { - - @RegisterExtension - public static MiniClusterExtension miniClusterResource = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); - - private final FileFormat format = FileFormat.PARQUET; - - @Parameter(index = 0) - private int parallelism; - - @Parameter(index = 1) - private boolean partitioned; - - @Parameter(index = 2) - private int writeParallelism; - - @Parameter(index = 3) - private boolean isTableSchema; - - @Parameters( - name = "parallelism = {0}, partitioned = {1}, writeParallelism = {2}, isTableSchema = {3}") - public static Object[][] parameters() { - return new Object[][] { - // Remove after the deprecation of TableSchema - BEGIN - {1, true, 1, true}, - {1, false, 1, true}, - {2, true, 2, true}, - {2, false, 2, true}, - {1, true, 2, true}, - {1, false, 2, true}, - // Remove after the deprecation of TableSchema - END - - {1, true, 1, false}, - {1, false, 1, false}, - {2, true, 2, false}, - {2, false, 2, false}, - {1, true, 2, false}, - {1, false, 2, false}, - }; - } - - @BeforeEach - public void before() throws IOException { - this.table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - partitioned - ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); - - this.env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(Math.max(parallelism, writeParallelism)); - - this.tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testShuffleByPartitionWithSchema() throws Exception { - testWriteRow(parallelism, SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH, isTableSchema); - if (partitioned) { - assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testJobNoneDistributeMode() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - testWriteRow(parallelism, null, DistributionMode.NONE, isTableSchema); - - if (parallelism > 1) { - if (partitioned) { - int files = partitionFiles("aaa") + partitionFiles("bbb") + partitionFiles("ccc"); - assertThat(files).isGreaterThan(3); - } - } - } - - @TestTemplate - public void testJobNullDistributionMode() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - testWriteRow(parallelism, null, null, isTableSchema); - - if (partitioned) { - assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testPartitionWriteMode() throws Exception { - testWriteRow(parallelism, null, DistributionMode.HASH, isTableSchema); - if (partitioned) { - assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - 
assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testOverrideWriteConfigWithUnknownDistributionMode() { - Map newProps = Maps.newHashMap(); - newProps.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), "UNRECOGNIZED"); - - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .setAll(newProps) - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .setAll(newProps) - .append(); - } - - assertThatThrownBy(env::execute) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid distribution mode: UNRECOGNIZED"); - } - - @TestTemplate - public void testRangeDistributionWithoutSortOrderUnpartitioned() throws Exception { - assumeThat(partitioned).isFalse(); - - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - - int numOfCheckpoints = 6; - DataStream dataStream = - env.addSource( - createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), - ROW_TYPE_INFO); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .append(); - } - - // Range distribution requires either sort order or partition spec defined - assertThatThrownBy(env::execute) - .isInstanceOf(IllegalStateException.class) - .hasMessage( - "Invalid write distribution mode: range. Need to define sort order or partition spec."); - } - - @TestTemplate - public void testRangeDistributionWithoutSortOrderPartitioned() throws Exception { - assumeThat(partitioned).isTrue(); - - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - - int numOfCheckpoints = 6; - DataStream dataStream = - env.addSource( - createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), - ROW_TYPE_INFO); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .append(); - } - - // sort based on partition columns - env.execute(getClass().getSimpleName()); - - table.refresh(); - // ordered in reverse timeline from the newest snapshot to the oldest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Sometimes we will have more checkpoints than the bounded source if we pass the - // auto checkpoint interval. Thus producing multiple snapshots. 
- assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); - } - - @TestTemplate - public void testRangeDistributionWithNullValue() throws Exception { - assumeThat(partitioned).isTrue(); - - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - - int numOfCheckpoints = 6; - List> charRows = createCharRows(numOfCheckpoints, 10); - charRows.add(ImmutableList.of(Row.of(1, null))); - DataStream dataStream = - env.addSource(createRangeDistributionBoundedSource(charRows), ROW_TYPE_INFO); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .append(); - } - - // sort based on partition columns - env.execute(getClass().getSimpleName()); - - table.refresh(); - // ordered in reverse timeline from the newest snapshot to the oldest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Sometimes we will have more checkpoints than the bounded source if we pass the - // auto checkpoint interval. Thus producing multiple snapshots. - assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); - } - - @TestTemplate - public void testRangeDistributionWithSortOrder() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - table.replaceSortOrder().asc("data").commit(); - - int numOfCheckpoints = 6; - DataStream dataStream = - env.addSource( - createRangeDistributionBoundedSource(createCharRows(numOfCheckpoints, 10)), - ROW_TYPE_INFO); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Map) - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Map) - .append(); - } - - env.execute(getClass().getSimpleName()); - - table.refresh(); - // ordered in reverse timeline from the newest snapshot to the oldest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Sometimes we will have more checkpoints than the bounded source if we pass the - // auto checkpoint interval. Thus producing multiple snapshots. - assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); - - // It takes 2 checkpoint cycle for statistics collection and application - // of the globally aggregated statistics in the range partitioner. 
- // The last two checkpoints should have range shuffle applied - List rangePartitionedCycles = - snapshots.subList(snapshots.size() - 2, snapshots.size()); - - if (partitioned) { - for (Snapshot snapshot : rangePartitionedCycles) { - List addedDataFiles = - Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); - // up to 26 partitions - assertThat(addedDataFiles).hasSizeLessThanOrEqualTo(26); - } - } else { - for (Snapshot snapshot : rangePartitionedCycles) { - List addedDataFiles = - Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); - // each writer task should only write one file for non-partition sort column - assertThat(addedDataFiles).hasSize(writeParallelism); - // verify there is no overlap in min-max stats range - if (parallelism > 1) { - assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); - } - } - } - } - - @TestTemplate - public void testRangeDistributionSketchWithSortOrder() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - table.replaceSortOrder().asc("id").commit(); - - int numOfCheckpoints = 6; - DataStream dataStream = - env.addSource( - createRangeDistributionBoundedSource(createIntRows(numOfCheckpoints, 1_000)), - ROW_TYPE_INFO); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Sketch) - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Sketch) - .append(); - } - - env.execute(getClass().getSimpleName()); - - table.refresh(); - // ordered in reverse timeline from the newest snapshot to the oldest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Sometimes we will have more checkpoints than the bounded source if we pass the - // auto checkpoint interval. Thus producing multiple snapshots. - assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); - - // It takes 2 checkpoint cycle for statistics collection and application - // of the globally aggregated statistics in the range partitioner. - // The last two checkpoints should have range shuffle applied - List rangePartitionedCycles = - snapshots.subList(snapshots.size() - 2, snapshots.size()); - - // since the input has a single value for the data column, - // it is always the same partition. 
Hence there is no difference - // for partitioned or not - for (Snapshot snapshot : rangePartitionedCycles) { - List addedDataFiles = - Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); - // each writer task should only write one file for non-partition sort column - assertThat(addedDataFiles).hasSize(writeParallelism); - // verify there is no overlap in min-max stats range - if (writeParallelism > 2) { - assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); - } - } - } - - /** Test migration from Map stats to Sketch stats */ - @TestTemplate - public void testRangeDistributionStatisticsMigration() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.RANGE.modeName()) - .commit(); - table.replaceSortOrder().asc("id").commit(); - - int numOfCheckpoints = 6; - List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); - for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { - // checkpointId 2 would emit 11_000 records which is larger than - // the OPERATOR_SKETCH_SWITCH_THRESHOLD of 10_000. - // This should trigger the stats migration. - int maxId = checkpointId < 2 ? 1_000 : 11_000; - List rows = Lists.newArrayListWithCapacity(maxId); - for (int j = 0; j < maxId; ++j) { - // fixed value "a" for the data (possible partition column) - rows.add(Row.of(j, "a")); - } - - rowsPerCheckpoint.add(rows); - } - - DataStream dataStream = - env.addSource(createRangeDistributionBoundedSource(rowsPerCheckpoint), ROW_TYPE_INFO); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Auto) - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(writeParallelism) - .rangeDistributionStatisticsType(StatisticsType.Auto) - .append(); - } - - env.execute(getClass().getSimpleName()); - - table.refresh(); - // ordered in reverse timeline from the newest snapshot to the oldest snapshot - List snapshots = Lists.newArrayList(table.snapshots().iterator()); - // only keep the snapshots with added data files - snapshots = - snapshots.stream() - .filter(snapshot -> snapshot.addedDataFiles(table.io()).iterator().hasNext()) - .collect(Collectors.toList()); - - // Sometimes we will have more checkpoints than the bounded source if we pass the - // auto checkpoint interval. Thus producing multiple snapshots. - assertThat(snapshots).hasSizeGreaterThanOrEqualTo(numOfCheckpoints); - - // It takes 2 checkpoint cycle for statistics collection and application - // of the globally aggregated statistics in the range partitioner. - // The last two checkpoints should have range shuffle applied - List rangePartitionedCycles = - snapshots.subList(snapshots.size() - 2, snapshots.size()); - - // since the input has a single value for the data column, - // it is always the same partition. 
Hence there is no difference - // for partitioned or not - for (Snapshot snapshot : rangePartitionedCycles) { - List addedDataFiles = - Lists.newArrayList(snapshot.addedDataFiles(table.io()).iterator()); - // each writer task should only write one file for non-partition sort column - // sometimes - assertThat(addedDataFiles).hasSize(writeParallelism); - // verify there is no overlap in min-max stats range - if (writeParallelism > 1) { - assertIdColumnStatsNoRangeOverlap(addedDataFiles.get(0), addedDataFiles.get(1)); - } - } - } - - private BoundedTestSource createRangeDistributionBoundedSource( - List> rowsPerCheckpoint) { - return new BoundedTestSource<>(rowsPerCheckpoint); - } - - private List> createCharRows(int numOfCheckpoints, int countPerChar) { - List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); - for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { - List rows = Lists.newArrayListWithCapacity(26 * countPerChar); - for (int j = 0; j < countPerChar; ++j) { - for (char c = 'a'; c <= 'z'; ++c) { - rows.add(Row.of(1, String.valueOf(c))); - } - } - - rowsPerCheckpoint.add(rows); - } - - return rowsPerCheckpoint; - } - - private List> createIntRows(int numOfCheckpoints, int maxId) { - List> rowsPerCheckpoint = Lists.newArrayListWithCapacity(numOfCheckpoints); - for (int checkpointId = 0; checkpointId < numOfCheckpoints; ++checkpointId) { - List rows = Lists.newArrayListWithCapacity(maxId); - for (int j = 0; j < maxId; ++j) { - // fixed value "a" for the data (possible partition column) - rows.add(Row.of(j, "a")); - } - - rowsPerCheckpoint.add(rows); - } - - return rowsPerCheckpoint; - } - - private void assertIdColumnStatsNoRangeOverlap(DataFile file1, DataFile file2) { - // id column has fieldId 1 - int file1LowerBound = - Conversions.fromByteBuffer(Types.IntegerType.get(), file1.lowerBounds().get(1)); - int file1UpperBound = - Conversions.fromByteBuffer(Types.IntegerType.get(), file1.upperBounds().get(1)); - int file2LowerBound = - Conversions.fromByteBuffer(Types.IntegerType.get(), file2.lowerBounds().get(1)); - int file2UpperBound = - Conversions.fromByteBuffer(Types.IntegerType.get(), file2.upperBounds().get(1)); - - if (file1LowerBound < file2LowerBound) { - assertThat(file1UpperBound).isLessThanOrEqualTo(file2LowerBound); - } else { - assertThat(file2UpperBound).isLessThanOrEqualTo(file1LowerBound); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java deleted file mode 100644 index c21c3d5cc21b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.table.data.RowData; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.Pair; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestFlinkManifest { - private static final Configuration CONF = new Configuration(); - - @TempDir protected Path temporaryFolder; - - private Table table; - private FileAppenderFactory appenderFactory; - private final AtomicInteger fileCount = new AtomicInteger(0); - - @BeforeEach - public void before() throws IOException { - File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); - String warehouse = folder.getAbsolutePath(); - - String tablePath = warehouse.concat("/test"); - assertThat(new File(tablePath).mkdir()).isTrue(); - - // Construct the iceberg table. 
- table = SimpleDataUtil.createTable(tablePath, ImmutableMap.of(), false); - - int[] equalityFieldIds = - new int[] { - table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() - }; - this.appenderFactory = - new FlinkAppenderFactory( - table, - table.schema(), - FlinkSchemaUtil.convert(table.schema()), - table.properties(), - table.spec(), - equalityFieldIds, - table.schema(), - null); - } - - @Test - public void testIO() throws IOException { - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - for (long checkpointId = 1; checkpointId <= 3; checkpointId++) { - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, 1, 1); - final long curCkpId = checkpointId; - - List dataFiles = generateDataFiles(10); - List eqDeleteFiles = generateEqDeleteFiles(5); - List posDeleteFiles = generatePosDeleteFiles(5); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(curCkpId), - table.spec()); - - WriteResult result = - FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs()); - assertThat(result.deleteFiles()).hasSize(10); - for (int i = 0; i < dataFiles.size(); i++) { - TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); - } - assertThat(result.deleteFiles()).hasSize(10); - for (int i = 0; i < 5; i++) { - TestHelpers.assertEquals(eqDeleteFiles.get(i), result.deleteFiles()[i]); - } - for (int i = 0; i < 5; i++) { - TestHelpers.assertEquals(posDeleteFiles.get(i), result.deleteFiles()[5 + i]); - } - } - } - - @Test - public void testUserProvidedManifestLocation() throws IOException { - long checkpointId = 1; - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - File userProvidedFolder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); - Map props = - ImmutableMap.of( - ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION, - userProvidedFolder.getAbsolutePath() + "///"); - ManifestOutputFileFactory factory = - new ManifestOutputFileFactory(() -> table, props, flinkJobId, operatorId, 1, 1); - - List dataFiles = generateDataFiles(5); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder().addDataFiles(dataFiles).build(), - () -> factory.create(checkpointId), - table.spec()); - - assertThat(deltaManifests.dataManifest()).isNotNull(); - assertThat(deltaManifests.deleteManifest()).isNull(); - assertThat(Paths.get(deltaManifests.dataManifest().path())) - .hasParent(userProvidedFolder.toPath()); - - WriteResult result = - FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs()); - - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(5); - - assertThat(result.dataFiles()).hasSameSizeAs(dataFiles); - for (int i = 0; i < dataFiles.size(); i++) { - TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); - } - } - - @Test - public void testVersionedSerializer() throws IOException { - long checkpointId = 1; - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, 1, 1); - - List dataFiles = generateDataFiles(10); - List eqDeleteFiles = 
generateEqDeleteFiles(10); - List posDeleteFiles = generatePosDeleteFiles(10); - DeltaManifests expected = - FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(checkpointId), - table.spec()); - - byte[] versionedSerializeData = - SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, expected); - DeltaManifests actual = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, versionedSerializeData); - TestHelpers.assertEquals(expected.dataManifest(), actual.dataManifest()); - TestHelpers.assertEquals(expected.deleteManifest(), actual.deleteManifest()); - - byte[] versionedSerializeData2 = - SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, actual); - assertThat(versionedSerializeData2).containsExactly(versionedSerializeData); - } - - @Test - public void testCompatibility() throws IOException { - // The v2 deserializer should be able to deserialize the v1 binary. - long checkpointId = 1; - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, 1, 1); - - List dataFiles = generateDataFiles(10); - ManifestFile manifest = - FlinkManifestUtil.writeDataFiles(factory.create(checkpointId), table.spec(), dataFiles); - byte[] dataV1 = - SimpleVersionedSerialization.writeVersionAndSerialize(new V1Serializer(), manifest); - - DeltaManifests delta = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, dataV1); - assertThat(delta.deleteManifest()).isNull(); - assertThat(delta.dataManifest()).isNotNull(); - TestHelpers.assertEquals(manifest, delta.dataManifest()); - - List actualFiles = - FlinkManifestUtil.readDataFiles(delta.dataManifest(), table.io(), table.specs()); - assertThat(actualFiles).hasSize(10); - for (int i = 0; i < 10; i++) { - TestHelpers.assertEquals(dataFiles.get(i), actualFiles.get(i)); - } - } - - private static class V1Serializer implements SimpleVersionedSerializer { - - @Override - public int getVersion() { - return 1; - } - - @Override - public byte[] serialize(ManifestFile m) throws IOException { - return ManifestFiles.encode(m); - } - - @Override - public ManifestFile deserialize(int version, byte[] serialized) throws IOException { - return ManifestFiles.decode(serialized); - } - } - - private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile( - table, - table.schema(), - table.spec(), - CONF, - table.location(), - FileFormat.PARQUET.addExtension(filename), - rows); - } - - private DeleteFile writeEqDeleteFile(String filename, List deletes) throws IOException { - return SimpleDataUtil.writeEqDeleteFile( - table, FileFormat.PARQUET, filename, appenderFactory, deletes); - } - - private DeleteFile writePosDeleteFile(String filename, List> positions) - throws IOException { - return SimpleDataUtil.writePosDeleteFile( - table, FileFormat.PARQUET, filename, appenderFactory, positions); - } - - private List generateDataFiles(int fileNum) throws IOException { - List rowDataList = Lists.newArrayList(); - List dataFiles = Lists.newArrayList(); - for (int i = 0; i < fileNum; i++) { - rowDataList.add(SimpleDataUtil.createRowData(i, "a" + i)); - 
dataFiles.add(writeDataFile("data-file-" + fileCount.incrementAndGet(), rowDataList)); - } - return dataFiles; - } - - private List generateEqDeleteFiles(int fileNum) throws IOException { - List rowDataList = Lists.newArrayList(); - List deleteFiles = Lists.newArrayList(); - for (int i = 0; i < fileNum; i++) { - rowDataList.add(SimpleDataUtil.createDelete(i, "a" + i)); - deleteFiles.add( - writeEqDeleteFile("eq-delete-file-" + fileCount.incrementAndGet(), rowDataList)); - } - return deleteFiles; - } - - private List generatePosDeleteFiles(int fileNum) throws IOException { - List> positions = Lists.newArrayList(); - List deleteFiles = Lists.newArrayList(); - for (int i = 0; i < fileNum; i++) { - positions.add(Pair.of("data-file-1", (long) i)); - deleteFiles.add( - writePosDeleteFile("pos-delete-file-" + fileCount.incrementAndGet(), positions)); - } - return deleteFiles; - } - - private static String newFlinkJobId() { - return UUID.randomUUID().toString(); - } - - private static String newOperatorUniqueId() { - return UUID.randomUUID().toString(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java deleted file mode 100644 index 939ed2be7dbc..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.Arrays; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestPartitioningWriters; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkPartitioningWriters extends TestPartitioningWriters { - - @Parameters(name = "formatVersion = {0}, fileFormat = {1}") - protected static List parameters() { - return Arrays.asList( - new Object[] {2, FileFormat.AVRO}, - new Object[] {2, FileFormat.PARQUET}, - new Object[] {2, FileFormat.ORC}); - } - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return FlinkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected RowData toRow(Integer id, String data) { - return SimpleDataUtil.createRowData(id, data); - } - - @Override - protected StructLikeSet toSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - RowType flinkType = FlinkSchemaUtil.convert(table.schema()); - for (RowData row : rows) { - RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct()); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java deleted file mode 100644 index 3050752d1c24..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestPositionDeltaWriters; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkPositionDeltaWriters extends TestPositionDeltaWriters { - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return FlinkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected RowData toRow(Integer id, String data) { - return SimpleDataUtil.createRowData(id, data); - } - - @Override - protected StructLikeSet toSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - RowType flinkType = FlinkSchemaUtil.convert(table.schema()); - for (RowData row : rows) { - RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct()); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java deleted file mode 100644 index 03051b69cf87..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestRollingFileWriters; -import org.apache.iceberg.util.ArrayUtil; - -public class TestFlinkRollingFileWriters extends TestRollingFileWriters { - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return FlinkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected RowData toRow(Integer id, String data) { - return SimpleDataUtil.createRowData(id, data); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java deleted file mode 100644 index e6d64ef2c720..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestWriterMetrics; - -public class TestFlinkWriterMetrics extends TestWriterMetrics { - - public TestFlinkWriterMetrics(FileFormat fileFormat) { - super(fileFormat); - } - - @Override - protected FileWriterFactory newWriterFactory(Table sourceTable) { - return FlinkFileWriterFactory.builderFor(sourceTable) - .dataSchema(sourceTable.schema()) - .dataFileFormat(fileFormat) - .deleteFileFormat(fileFormat) - .positionDeleteRowSchema(sourceTable.schema()) - .build(); - } - - @Override - protected RowData toRow(Integer id, String data, boolean boolValue, Long longValue) { - GenericRowData nested = GenericRowData.of(boolValue, longValue); - GenericRowData row = GenericRowData.of(id, StringData.fromString(data), nested); - return row; - } - - @Override - public RowData toGenericRow(int value, int repeated) { - GenericRowData row = new GenericRowData(repeated); - for (int i = 0; i < repeated; i++) { - row.setField(i, value); - } - return row; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java deleted file mode 100644 index 584b9962eaee..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergCommitter.java +++ /dev/null @@ -1,1446 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; -import static org.apache.iceberg.flink.sink.SinkTestUtil.extractAndAssertCommittableSummary; -import static org.apache.iceberg.flink.sink.SinkTestUtil.extractAndAssertCommittableWithLineage; -import static org.apache.iceberg.flink.sink.SinkTestUtil.transformsToStreamElement; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assumptions.assumeThat; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.spy; - -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.flink.api.common.JobID; -import org.apache.flink.api.common.TaskInfo; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.api.connector.sink2.Committer; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.runtime.execution.Environment; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; -import org.apache.flink.streaming.api.connector.sink2.CommittableMessageSerializer; -import org.apache.flink.streaming.api.connector.sink2.CommittableSummary; -import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; -import org.apache.flink.streaming.api.connector.sink2.SinkV2Assertions; -import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; -import org.apache.flink.streaming.runtime.operators.sink.CommitterOperatorFactory; -import org.apache.flink.streaming.runtime.streamrecord.StreamElement; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.runtime.tasks.StreamTask; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.SnapshotSummary; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TestBase; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@ExtendWith(ParameterizedTestExtension.class) -class TestIcebergCommitter extends TestBase { - private static final Logger LOG = LoggerFactory.getLogger(TestIcebergCommitter.class); - public static final String OPERATOR_ID = "flink-sink"; - @TempDir File temporaryFolder; - - @TempDir File flinkManifestFolder; - - private Table table; - - private TableLoader tableLoader; - - @Parameter(index = 1) - private Boolean isStreamingMode; - - @Parameter(index = 2) - private String branch; - - private final String jobId = "jobId"; - private final long dataFIleRowCount = 5L; - - private final TestCommittableMessageTypeSerializer committableMessageTypeSerializer = - new TestCommittableMessageTypeSerializer(); - - private final DataFile dataFileTest1 = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withMetrics( - new Metrics( - dataFIleRowCount, - null, // no column sizes - ImmutableMap.of(1, 5L), // value count - ImmutableMap.of(1, 0L), // null count - null, - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private final DataFile dataFileTest2 = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withMetrics( - new Metrics( - dataFIleRowCount, - null, // no column sizes - ImmutableMap.of(1, 5L), // value count - ImmutableMap.of(1, 0L), // null count - null, - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - @SuppressWarnings("checkstyle:NestedForDepth") - @Parameters(name = "formatVersion={0} isStreaming={1}, branch={2}") - protected static List parameters() { - List parameters = Lists.newArrayList(); - for (Boolean isStreamingMode : new Boolean[] {true, false}) { - for (int formatVersion : org.apache.iceberg.TestHelpers.ALL_VERSIONS) { - parameters.add(new Object[] {formatVersion, isStreamingMode, SnapshotRef.MAIN_BRANCH}); - parameters.add(new Object[] {formatVersion, isStreamingMode, "test-branch"}); - } - } - return parameters; - } - - @BeforeEach - public void before() throws Exception { - String warehouse = temporaryFolder.getAbsolutePath(); - - String tablePath = warehouse.concat("/test"); - assertThat(new File(tablePath).mkdir()).as("Should create the table path correctly.").isTrue(); - - Map props = - ImmutableMap.of( - TableProperties.FORMAT_VERSION, - String.valueOf(formatVersion), - FLINK_MANIFEST_LOCATION, - flinkManifestFolder.getAbsolutePath(), - IcebergCommitter.MAX_CONTINUOUS_EMPTY_COMMITS, - "1"); - table = SimpleDataUtil.createTable(tablePath, props, false); - tableLoader = TableLoader.fromHadoopTable(tablePath); - } - - @TestTemplate - public void testCommitTxnWithoutDataFiles() throws Exception { - IcebergCommitter committer = getCommitter(); - SimpleDataUtil.assertTableRows(table, Lists.newArrayList(), branch); - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, -1); - - for (long i = 1; i <= 3; i++) { - Committer.CommitRequest commitRequest = - buildCommitRequestFor(jobId, i, 
Lists.newArrayList()); - committer.commit(Lists.newArrayList(commitRequest)); - assertMaxCommittedCheckpointId(jobId, i); - assertSnapshotSize((int) i); - } - } - - @TestTemplate - public void testMxContinuousEmptyCommits() throws Exception { - table.updateProperties().set(IcebergCommitter.MAX_CONTINUOUS_EMPTY_COMMITS, "3").commit(); - IcebergCommitter committer = getCommitter(); - for (int i = 1; i <= 9; i++) { - Committer.CommitRequest commitRequest = - buildCommitRequestFor(jobId, i, Lists.newArrayList()); - committer.commit(Lists.newArrayList(commitRequest)); - assertFlinkManifests(0); - assertSnapshotSize(i / 3); - } - } - - @TestTemplate - public void testCommitTxn() throws Exception { - IcebergCommitter committer = getCommitter(); - assertSnapshotSize(0); - List rows = Lists.newArrayListWithExpectedSize(3); - for (int i = 1; i <= 3; i++) { - RowData rowData = SimpleDataUtil.createRowData(i, "hello" + i); - DataFile dataFile = writeDataFile("data-" + i, ImmutableList.of(rowData)); - rows.add(rowData); - WriteResult writeResult = of(dataFile); - Committer.CommitRequest commitRequest = - buildCommitRequestFor(jobId, i, Lists.newArrayList(writeResult)); - committer.commit(Lists.newArrayList(commitRequest)); - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows), branch); - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(jobId, i); - Map summary = SimpleDataUtil.latestSnapshot(table, branch).summary(); - assertThat(summary) - .containsEntry("flink.test", "org.apache.iceberg.flink.sink.TestIcebergCommitter") - .containsEntry("added-data-files", "1") - .containsEntry("flink.operator-id", OPERATOR_ID) - .containsEntry("flink.job-id", "jobId"); - } - } - - @TestTemplate - public void testOrderedEventsBetweenCheckpoints() throws Exception { - // It's possible that two checkpoints happen in the following orders: - // 1. snapshotState for checkpoint#1; - // 2. snapshotState for checkpoint#2; - // 3. notifyCheckpointComplete for checkpoint#1; - // 4. notifyCheckpointComplete for checkpoint#2; - - long timestamp = 0; - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness = getTestHarness()) { - - harness.open(); - - assertMaxCommittedCheckpointId(jobId, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - processElement(jobId, 1, harness, 1, OPERATOR_ID, dataFile1); - assertMaxCommittedCheckpointId(jobId, -1L); - - // 1. snapshotState for checkpoint#1 - long firstCheckpointId = 1; - harness.snapshot(firstCheckpointId, ++timestamp); - assertFlinkManifests(1); - - RowData row2 = SimpleDataUtil.createRowData(2, "world"); - DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); - processElement(jobId, 2, harness, 1, OPERATOR_ID, dataFile2); - assertMaxCommittedCheckpointId(jobId, -1L); - - // 2. snapshotState for checkpoint#2 - long secondCheckpointId = 2; - OperatorSubtaskState snapshot = harness.snapshot(secondCheckpointId, ++timestamp); - assertFlinkManifests(2); - - // 3. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(firstCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - assertMaxCommittedCheckpointId(jobId, firstCheckpointId); - assertFlinkManifests(1); - - // 4. 
notifyCheckpointComplete for checkpoint#2 - harness.notifyOfCompletedCheckpoint(secondCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); - assertMaxCommittedCheckpointId(jobId, secondCheckpointId); - assertFlinkManifests(0); - } - } - - @TestTemplate - public void testDisorderedEventsBetweenCheckpoints() throws Exception { - // It's possible that two checkpoints happen in the following orders: - // 1. snapshotState for checkpoint#1; - // 2. snapshotState for checkpoint#2; - // 3. notifyCheckpointComplete for checkpoint#2; - // 4. notifyCheckpointComplete for checkpoint#1; - - long timestamp = 0; - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness = getTestHarness()) { - - harness.open(); - assertMaxCommittedCheckpointId(jobId, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - processElement(jobId, 1, harness, 1, OPERATOR_ID, dataFile1); - assertMaxCommittedCheckpointId(jobId, -1L); - - // 1. snapshotState for checkpoint#1 - long firstCheckpointId = 1; - harness.snapshot(firstCheckpointId, ++timestamp); - assertFlinkManifests(1); - - RowData row2 = SimpleDataUtil.createRowData(2, "world"); - DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); - processElement(jobId, 2, harness, 1, OPERATOR_ID, dataFile2); - assertMaxCommittedCheckpointId(jobId, -1L); - - // 2. snapshotState for checkpoint#2 - long secondCheckpointId = 2; - harness.snapshot(secondCheckpointId, ++timestamp); - assertFlinkManifests(2); - - // 3. notifyCheckpointComplete for checkpoint#2 - harness.notifyOfCompletedCheckpoint(secondCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); - assertMaxCommittedCheckpointId(jobId, secondCheckpointId); - assertFlinkManifests(0); - - // 4. 
notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(firstCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); - assertMaxCommittedCheckpointId(jobId, secondCheckpointId); - assertFlinkManifests(0); - } - } - - @TestTemplate - public void testSingleCommit() throws Exception { - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - testHarness = getTestHarness()) { - testHarness.open(); - - long checkpointId = 1; - - RowData row1 = SimpleDataUtil.createRowData(1, "hello1"); - DataFile dataFile1 = writeDataFile("data-1-1", ImmutableList.of(row1)); - CommittableSummary committableSummary = - processElement(jobId, checkpointId, testHarness, 1, OPERATOR_ID, dataFile1); - - // Trigger commit - testHarness.notifyOfCompletedCheckpoint(checkpointId); - - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, 1L); - - List output = transformsToStreamElement(testHarness.getOutput()); - - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) - .hasFailedCommittables(committableSummary.getNumberOfFailedCommittables()) - .hasOverallCommittables(committableSummary.getNumberOfCommittables()) - .hasPendingCommittables(0); - - SinkV2Assertions.assertThat(extractAndAssertCommittableWithLineage(output.get(1))) - .hasSubtaskId(0) - .hasCheckpointId(checkpointId); - } - - table.refresh(); - Snapshot currentSnapshot = table.snapshot(branch); - - assertThat(currentSnapshot.summary()) - .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, "1") - .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "1"); - } - - /** The data was not committed in the previous job. */ - @TestTemplate - public void testStateRestoreFromPreJobWithUncommitted() throws Exception { - String jobId1 = "jobId1"; - OperatorSubtaskState snapshot; - - // We cannot test a different checkpoint thant 0 because when using the OperatorTestHarness - // for recovery the lastCompleted checkpoint is always reset to 0. 
- // see: https://github.com/apache/iceberg/issues/10942 - long checkpointId = 0; - long timestamp = 0; - CommittableSummary committableSummary; - - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - preJobTestHarness = getTestHarness()) { - - preJobTestHarness.open(); - - committableSummary = - processElement(jobId1, checkpointId, preJobTestHarness, 1, OPERATOR_ID, dataFileTest1); - - snapshot = preJobTestHarness.snapshot(checkpointId, ++timestamp); - - assertThat(preJobTestHarness.getOutput()).isEmpty(); - } - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId1, -1L); - - String jobId2 = "jobId2"; - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - restored = getTestHarness()) { - restored.setup(committableMessageTypeSerializer); - restored.initializeState(snapshot); - restored.open(); - - // Previous committables are immediately committed if possible - List output = transformsToStreamElement(restored.getOutput()); - assertThat(output).hasSize(2); - - SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) - .hasFailedCommittables(committableSummary.getNumberOfFailedCommittables()) - .hasOverallCommittables(committableSummary.getNumberOfCommittables()) - .hasPendingCommittables(0); - - SinkV2Assertions.assertThat(extractAndAssertCommittableWithLineage(output.get(1))) - .hasCheckpointId(0L) - .hasSubtaskId(0); - - table.refresh(); - - Snapshot currentSnapshot = table.snapshot(branch); - - assertThat(currentSnapshot.summary()) - .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount)) - .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "1") - .containsEntry("flink.job-id", jobId1); - - checkpointId++; - CommittableSummary committableSummary2 = - processElement(jobId2, checkpointId, restored, 1, OPERATOR_ID, dataFileTest2); - - // Trigger commit - restored.notifyOfCompletedCheckpoint(checkpointId); - - List output2 = transformsToStreamElement(restored.getOutput()); - SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output2.get(0))) - .hasFailedCommittables(committableSummary2.getNumberOfFailedCommittables()) - .hasOverallCommittables(committableSummary2.getNumberOfCommittables()) - .hasPendingCommittables(0); - - SinkV2Assertions.assertThat(extractAndAssertCommittableWithLineage(output2.get(1))) - .hasCheckpointId(0L) - .hasSubtaskId(0); - } - - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId2, 1); - - table.refresh(); - Snapshot currentSnapshot2 = table.snapshot(branch); - - assertThat(currentSnapshot2.summary()) - .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount * 2)) - .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "2") - .containsEntry("flink.job-id", jobId2); - } - - /** The data was committed in the previous job. */ - @TestTemplate - public void testStateRestoreFromPreJobWithCommitted() throws Exception { - String jobId1 = "jobId1"; - OperatorSubtaskState snapshot; - - // We cannot test a different checkpoint thant 0 because when using the OperatorTestHarness - // for recovery the lastCompleted checkpoint is always reset to 0. 
- // see: https://github.com/apache/iceberg/issues/10942 - long checkpointId = 0; - - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - preJobTestHarness = getTestHarness()) { - - preJobTestHarness.open(); - - CommittableSummary committableSummary = - processElement(jobId1, checkpointId, preJobTestHarness, 1, OPERATOR_ID, dataFileTest1); - - assertFlinkManifests(1); - snapshot = preJobTestHarness.snapshot(checkpointId, 2L); - // commit snapshot - preJobTestHarness.notifyOfCompletedCheckpoint(checkpointId); - - List output = transformsToStreamElement(preJobTestHarness.getOutput()); - assertThat(output).hasSize(2); - - SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) - .hasFailedCommittables(committableSummary.getNumberOfFailedCommittables()) - .hasOverallCommittables(committableSummary.getNumberOfCommittables()) - .hasPendingCommittables(0); - - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId1, checkpointId); - } - - table.refresh(); - long preJobSnapshotId = table.snapshot(branch).snapshotId(); - - String jobId2 = "jobId2"; - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - restored = getTestHarness()) { - restored.setup(); - restored.initializeState(snapshot); - restored.open(); - - // Makes sure that data committed in the previous job is available in this job - List output2 = transformsToStreamElement(restored.getOutput()); - assertThat(output2).hasSize(2); - - table.refresh(); - long restoredSnapshotId = table.snapshot(branch).snapshotId(); - - assertThat(restoredSnapshotId) - .as("The table does not generate a new snapshot without data being committed.") - .isEqualTo(preJobSnapshotId); - - assertThat(table.snapshot(branch).summary()) - .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount)) - .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "1") - .containsEntry("flink.job-id", jobId1); - - // Commit new data file - checkpointId = 1; - CommittableSummary committableSummary2 = - processElement(jobId2, checkpointId, restored, 1, OPERATOR_ID, dataFileTest2); - - // Trigger commit - restored.notifyOfCompletedCheckpoint(checkpointId); - - List output3 = transformsToStreamElement(restored.getOutput()); - assertThat(output3).hasSize(4); - SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output3.get(0))) - .hasFailedCommittables(committableSummary2.getNumberOfFailedCommittables()) - .hasOverallCommittables(committableSummary2.getNumberOfCommittables()) - .hasPendingCommittables(0); - } - - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId2, 1L); - - table.refresh(); - Snapshot currentSnapshot2 = table.snapshot(branch); - assertThat(Long.parseLong(currentSnapshot2.summary().get(SnapshotSummary.TOTAL_RECORDS_PROP))) - .isEqualTo(dataFIleRowCount * 2); - - assertThat(currentSnapshot2.summary()) - .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount * 2)) - .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "2") - .containsEntry("flink.job-id", jobId2); - } - - @TestTemplate - public void testStateRestoreFromCurrJob() throws Exception { - String jobId1 = "jobId1"; - CommittableSummary committableSummary; - OperatorSubtaskState snapshot; - - // We cannot test a different checkpoint thant 0 because when using the OperatorTestHarness - // for recovery the lastCompleted checkpoint is always reset to 0. 
- // see: https://github.com/apache/iceberg/issues/10942 - long checkpointId = 0; - - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - testHarness = getTestHarness()) { - - testHarness.open(); - - committableSummary = - processElement(jobId1, checkpointId, testHarness, 1, OPERATOR_ID, dataFileTest1); - snapshot = testHarness.snapshot(checkpointId, 2L); - - assertThat(testHarness.getOutput()).isEmpty(); - } - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId1, -1L); - - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - restored = getTestHarness()) { - - restored.setup(committableMessageTypeSerializer); - - restored.initializeState(snapshot); - restored.open(); - - // Previous committables are immediately committed if possible - List output = transformsToStreamElement(restored.getOutput()); - assertThat(output).hasSize(2); - - SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) - .hasFailedCommittables(committableSummary.getNumberOfFailedCommittables()) - .hasOverallCommittables(committableSummary.getNumberOfCommittables()) - .hasPendingCommittables(0); - - table.refresh(); - Snapshot currentSnapshot = table.snapshot(branch); - - assertThat(currentSnapshot.summary()) - .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount)) - .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "1") - .containsEntry("flink.job-id", jobId1); - - String jobId2 = "jobId2"; - checkpointId = 1; - CommittableSummary committableSummary2 = - processElement(jobId2, checkpointId, restored, 1, OPERATOR_ID, dataFileTest2); - - // Trigger commit - restored.notifyOfCompletedCheckpoint(checkpointId); - - List output2 = transformsToStreamElement(restored.getOutput()); - SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output2.get(0))) - .hasFailedCommittables(committableSummary2.getNumberOfFailedCommittables()) - .hasOverallCommittables(committableSummary2.getNumberOfCommittables()) - .hasPendingCommittables(0); - restored.close(); - - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId2, 1L); - - table.refresh(); - Snapshot currentSnapshot2 = table.snapshot(branch); - assertThat(currentSnapshot2.summary()) - .containsEntry(SnapshotSummary.TOTAL_RECORDS_PROP, String.valueOf(dataFIleRowCount * 2)) - .containsEntry(SnapshotSummary.TOTAL_DATA_FILES_PROP, "2") - .containsEntry("flink.job-id", jobId2); - } - } - - @TestTemplate - public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception { - // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). - // The Flink job should be able to restore from a checkpoint with only step#1 finished. - - // We cannot test a different checkpoint thant 0 because when using the OperatorTestHarness - // for recovery the lastCompleted checkpoint is always reset to 0. 
- // see: https://github.com/apache/iceberg/issues/10942 - long checkpointId = 0; - long timestamp = 0; - OperatorSubtaskState snapshot; - List expectedRows = Lists.newArrayList(); - - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness = getTestHarness()) { - - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, -1L); - - RowData row = SimpleDataUtil.createRowData(1, "hello"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-1", ImmutableList.of(row)); - processElement(jobId, checkpointId, harness, 1, operatorId.toString(), dataFile); - - snapshot = harness.snapshot(++checkpointId, ++timestamp); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(), branch); - assertMaxCommittedCheckpointId(jobId, -1L); - assertFlinkManifests(1); - } - - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness = getTestHarness()) { - - harness.getStreamConfig().setOperatorID(operatorId); - harness.initializeState(snapshot); - harness.open(); - - // All flink manifests should be cleaned because it has committed the unfinished iceberg - // transaction. - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertMaxCommittedCheckpointId(jobId, operatorId.toString(), 0L); - - harness.snapshot(++checkpointId, ++timestamp); - // Did not write any new record, so it won't generate new manifest. - assertFlinkManifests(0); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(1); - - assertMaxCommittedCheckpointId(jobId, operatorId.toString(), 0); - - RowData row = SimpleDataUtil.createRowData(2, "world"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); - processElement(jobId, checkpointId, harness, 1, operatorId.toString(), dataFile); - - snapshot = harness.snapshot(++checkpointId, ++timestamp); - - assertFlinkManifests(1); - } - - // Redeploying flink job from external checkpoint. - JobID newJobId = new JobID(); - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness = getTestHarness()) { - harness.getStreamConfig().setOperatorID(operatorId); - harness.initializeState(snapshot); - harness.open(); - - // test harness has a limitation wherein it is not able to commit pending commits when - // initializeState is called, when the checkpointId > 0 - // so we have to call it explicitly - harness.notifyOfCompletedCheckpoint(checkpointId); - - // All flink manifests should be cleaned because it has committed the unfinished iceberg - // transaction. 
- assertFlinkManifests(0); - - assertMaxCommittedCheckpointId(newJobId.toString(), operatorId.toString(), -1); - assertMaxCommittedCheckpointId(jobId, operatorId.toString(), 2); - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(2); - - RowData row = SimpleDataUtil.createRowData(3, "foo"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-3", ImmutableList.of(row)); - processElement( - newJobId.toString(), checkpointId, harness, 1, operatorId.toString(), dataFile); - - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(3); - assertMaxCommittedCheckpointId(newJobId.toString(), operatorId.toString(), 3); - } - } - - @TestTemplate - public void testStartAnotherJobToWriteSameTable() throws Exception { - long checkpointId = 1; - long timestamp = 0; - - List rows = Lists.newArrayList(); - List tableRows = Lists.newArrayList(); - - JobID oldJobId = new JobID(); - OperatorID oldOperatorId; - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness = getTestHarness()) { - - harness.open(); - oldOperatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(oldJobId.toString(), oldOperatorId.toString(), -1L); - - for (int i = 1; i <= 3; i++) { - rows.add(SimpleDataUtil.createRowData(i, "hello" + i)); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); - processElement( - oldJobId.toString(), ++checkpointId, harness, 1, oldOperatorId.toString(), dataFile); - harness.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, tableRows, branch); - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(oldJobId.toString(), oldOperatorId.toString(), checkpointId); - } - } - - // The new started job will start with checkpoint = 1 again. 
- checkpointId = 1; - JobID newJobId = new JobID(); - OperatorID newOperatorId; - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness = getTestHarness()) { - harness.open(); - newOperatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(3); - assertMaxCommittedCheckpointId(oldJobId.toString(), oldOperatorId.toString(), 4); - assertMaxCommittedCheckpointId(newJobId.toString(), newOperatorId.toString(), -1); - - rows.add(SimpleDataUtil.createRowData(2, "world")); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile("data-new-1", rows); - processElement( - newJobId.toString(), checkpointId, harness, 1, newOperatorId.toString(), dataFile); - harness.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, tableRows, branch); - assertSnapshotSize(4); - assertMaxCommittedCheckpointId(newJobId.toString(), newOperatorId.toString(), checkpointId); - } - } - - @TestTemplate - public void testMultipleJobsWriteSameTable() throws Exception { - long timestamp = 0; - List tableRows = Lists.newArrayList(); - - JobID[] jobs = new JobID[] {new JobID(), new JobID(), new JobID()}; - OperatorID[] operatorIds = - new OperatorID[] {new OperatorID(), new OperatorID(), new OperatorID()}; - for (int i = 0; i < 20; i++) { - int jobIndex = i % 3; - int checkpointId = i / 3; - JobID jobID = jobs[jobIndex]; - OperatorID operatorId = operatorIds[jobIndex]; - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness = getTestHarness()) { - harness.getStreamConfig().setOperatorID(operatorId); - - harness.open(); - - assertSnapshotSize(i); - assertMaxCommittedCheckpointId( - jobID.toString(), operatorId.toString(), checkpointId == 0 ? -1 : checkpointId - 1); - - List rows = Lists.newArrayList(SimpleDataUtil.createRowData(i, "word-" + i)); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); - - processElement(jobID.toString(), checkpointId, harness, 1, operatorId.toString(), dataFile); - - harness.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, tableRows, branch); - assertSnapshotSize(i + 1); - assertMaxCommittedCheckpointId(jobID.toString(), operatorId.toString(), checkpointId); - } - } - } - - @TestTemplate - public void testMultipleSinksRecoveryFromValidSnapshot() throws Exception { - - // We cannot test a different checkpoint thant 0 because when using the OperatorTestHarness - // for recovery the lastCompleted checkpoint is always reset to 0. 
- // see: https://github.com/apache/iceberg/issues/10942 - long checkpointId = 0; - long timestamp = 0; - List expectedRows = Lists.newArrayList(); - OperatorSubtaskState snapshot1; - OperatorSubtaskState snapshot2; - - JobID jobID = new JobID(); - OperatorID operatorId1 = new OperatorID(); - OperatorID operatorId2 = new OperatorID(); - - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness1 = getTestHarness()) { - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness2 = getTestHarness()) { - harness1.getStreamConfig().setOperatorID(operatorId1); - harness1.setup(); - harness1.open(); - harness2.getStreamConfig().setOperatorID(operatorId2); - harness2.setup(); - harness2.open(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobID.toString(), operatorId1.toString(), -1L); - assertMaxCommittedCheckpointId(jobID.toString(), operatorId2.toString(), -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello1"); - expectedRows.add(row1); - DataFile dataFile1 = writeDataFile("data-1-1", ImmutableList.of(row1)); - processElement( - jobID.toString(), checkpointId, harness1, 1, operatorId1.toString(), dataFile1); - - snapshot1 = harness1.snapshot(checkpointId, ++timestamp); - - RowData row2 = SimpleDataUtil.createRowData(1, "hello2"); - expectedRows.add(row2); - DataFile dataFile2 = writeDataFile("data-1-2", ImmutableList.of(row2)); - processElement( - jobID.toString(), checkpointId, harness2, 1, operatorId2.toString(), dataFile2); - - snapshot2 = harness2.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(2); - - // Only notify one of the committers - harness1.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(1); - - // Only the first row is committed at this point - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobID.toString(), operatorId1.toString(), checkpointId); - assertMaxCommittedCheckpointId(jobID.toString(), operatorId2.toString(), -1); - } - } - - // Restore from the given snapshot - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness1 = getTestHarness(); - OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness2 = getTestHarness()) { - harness1.getStreamConfig().setOperatorID(operatorId1); - harness1.setup(); - harness1.initializeState(snapshot1); - harness1.open(); - - harness2.getStreamConfig().setOperatorID(operatorId2); - harness2.setup(); - harness2.initializeState(snapshot2); - harness2.open(); - - // All flink manifests should be cleaned because it has committed the unfinished iceberg - // transaction. 
- assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobID.toString(), operatorId1.toString(), checkpointId); - assertMaxCommittedCheckpointId(jobID.toString(), operatorId2.toString(), checkpointId); - - RowData row1 = SimpleDataUtil.createRowData(2, "world1"); - expectedRows.add(row1); - DataFile dataFile1 = writeDataFile("data-2-1", ImmutableList.of(row1)); - - checkpointId++; - processElement( - jobID.toString(), checkpointId, harness1, 1, operatorId1.toString(), dataFile1); - - harness1.snapshot(checkpointId, ++timestamp); - - RowData row2 = SimpleDataUtil.createRowData(2, "world2"); - expectedRows.add(row2); - DataFile dataFile2 = writeDataFile("data-2-2", ImmutableList.of(row2)); - processElement( - jobID.toString(), checkpointId, harness2, 1, operatorId2.toString(), dataFile2); - - harness2.snapshot(checkpointId, ++timestamp); - - assertFlinkManifests(2); - - harness1.notifyOfCompletedCheckpoint(checkpointId); - harness2.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(4); - assertMaxCommittedCheckpointId(jobID.toString(), operatorId1.toString(), checkpointId); - assertMaxCommittedCheckpointId(jobID.toString(), operatorId2.toString(), checkpointId); - } - } - - @TestTemplate - public void testFlinkManifests() throws Exception { - long timestamp = 0; - long checkpoint = 1; - - JobID jobID = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - harness = getTestHarness()) { - - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobID.toString(), operatorId.toString(), -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - // harness.processElement(of(dataFile1), ++timestamp); - processElement(jobID.toString(), checkpoint, harness, 1, operatorId.toString(), dataFile1); - - assertMaxCommittedCheckpointId(jobID.toString(), operatorId.toString(), -1L); - - // 1. snapshotState for checkpoint#1 - harness.snapshot(checkpoint, ++timestamp); - List manifestPaths = assertFlinkManifests(1); - Path manifestPath = manifestPaths.get(0); - assertThat(manifestPath.getFileName()) - .asString() - .isEqualTo( - String.format("%s-%s-%05d-%d-%d-%05d.avro", jobID, operatorId, 0, 0, checkpoint, 1)); - // - // 2. Read the data files from manifests and assert. - List dataFiles = - FlinkManifestUtil.readDataFiles( - createTestingManifestFile(manifestPath, dataFile1), table.io(), table.specs()); - assertThat(dataFiles).hasSize(1); - TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); - - // 3. 
notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - assertMaxCommittedCheckpointId(jobID.toString(), operatorId.toString(), checkpoint); - assertFlinkManifests(0); - } - } - - @TestTemplate - public void testHandleEndInput() throws Exception { - assumeThat(isStreamingMode).as("Only support batch mode").isFalse(); - - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - testHarness = getTestHarness()) { - - testHarness.open(); - - long checkpointId = Long.MAX_VALUE; - processElement(jobId, checkpointId, testHarness, 1, OPERATOR_ID, dataFileTest1); - - testHarness.endInput(); - - assertMaxCommittedCheckpointId(jobId, OPERATOR_ID, Long.MAX_VALUE); - - List output = transformsToStreamElement(testHarness.getOutput()); - assertThat(output).hasSize(2); - - SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) - .hasCheckpointId(checkpointId) - .hasPendingCommittables(0) - .hasOverallCommittables(1) - .hasFailedCommittables(0); - - // endInput is idempotent - testHarness.endInput(); - assertThat(testHarness.getOutput()).hasSize(2); - } - } - - @TestTemplate - public void testDeleteFiles() throws Exception { - - assumeThat(formatVersion).as("Only support delete in format v2").isGreaterThanOrEqualTo(2); - - FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); - - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - testHarness = getTestHarness()) { - - testHarness.open(); - - long checkpointId = 1; - RowData row1 = SimpleDataUtil.createInsert(1, "aaa"); - DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(row1)); - processElement(jobId, checkpointId, testHarness, 1, OPERATOR_ID, dataFile1); - - // testHarness.snapshot(checkpointId, 0); - testHarness.notifyOfCompletedCheckpoint(checkpointId); - - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, checkpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - - List output = transformsToStreamElement(testHarness.getOutput()); - assertThat(output).hasSize(2); - SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output.get(0))) - .hasCheckpointId(checkpointId) - .hasPendingCommittables(0) - .hasOverallCommittables(1) - .hasFailedCommittables(0); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - - // The 2. commit - checkpointId = 2; - RowData row2 = SimpleDataUtil.createInsert(2, "bbb"); - DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(row2)); - - RowData row3 = SimpleDataUtil.createInsert(3, "ccc"); - DataFile dataFile3 = writeDataFile("data-file-3", ImmutableList.of(row3)); - processElement(jobId, checkpointId, testHarness, 2, OPERATOR_ID, dataFile2, dataFile3); - - // testHarness.snapshot(checkpointId, 1); - testHarness.notifyOfCompletedCheckpoint(checkpointId); - - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId, checkpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2, row3), branch); - - List output2 = transformsToStreamElement(testHarness.getOutput()); - assertThat(output2).hasSize(2 + 2); - SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output2.get(2))) - .hasCheckpointId(checkpointId) - .hasPendingCommittables(0) - .hasOverallCommittables(1) - .hasFailedCommittables(0); - - // The 3. 
commit - checkpointId = 3; - RowData delete1 = SimpleDataUtil.createDelete(1, "aaa"); - DeleteFile deleteFile1 = - writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); - RowData row4 = SimpleDataUtil.createInsert(4, "ddd"); - DataFile dataFile4 = writeDataFile("data-file-4", ImmutableList.of(row4)); - - RowData row5 = SimpleDataUtil.createInsert(5, "eee"); - DataFile dataFile5 = writeDataFile("data-file-5", ImmutableList.of(row5)); - WriteResult withRecord4 = - WriteResult.builder() - .addDataFiles(dataFile4, dataFile5) - .addDeleteFiles(deleteFile1) - .build(); - processElement(withRecord4, jobId, checkpointId, testHarness, 2, OPERATOR_ID); - - // testHarness.snapshot(checkpointId, 3); - testHarness.notifyOfCompletedCheckpoint(checkpointId); - - assertSnapshotSize(3); - assertMaxCommittedCheckpointId(jobId, checkpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row2, row3, row4, row5), branch); - - List output3 = transformsToStreamElement(testHarness.getOutput()); - assertThat(output3).hasSize(2 + 2 + 2); - SinkV2Assertions.assertThat(extractAndAssertCommittableSummary(output3.get(4))) - .hasCheckpointId(checkpointId) - .hasPendingCommittables(0) - .hasOverallCommittables(1) - .hasFailedCommittables(0); - } - } - - private ManifestFile createTestingManifestFile(Path manifestPath, DataFile dataFile) - throws IOException { - ManifestWriter writer = - ManifestFiles.write( - formatVersion, - PartitionSpec.unpartitioned(), - table.io().newOutputFile(manifestPath.toString()), - 0L); - writer.add(dataFile); - writer.close(); - return writer.toManifestFile(); - } - - private IcebergWriteAggregator buildIcebergWriteAggregator(String myJobId, String operatorId) { - IcebergWriteAggregator icebergWriteAggregator = spy(new IcebergWriteAggregator(tableLoader)); - StreamTask ctx = mock(StreamTask.class); - Environment env = mock(Environment.class); - StreamingRuntimeContext streamingRuntimeContext = mock(StreamingRuntimeContext.class); - TaskInfo taskInfo = mock(TaskInfo.class); - JobID myJobID = mock(JobID.class); - OperatorID operatorID = mock(OperatorID.class); - doReturn(myJobId).when(myJobID).toString(); - doReturn(myJobID).when(env).getJobID(); - doReturn(env).when(ctx).getEnvironment(); - doReturn(ctx).when(icebergWriteAggregator).getContainingTask(); - doReturn(operatorId).when(operatorID).toString(); - doReturn(operatorID).when(icebergWriteAggregator).getOperatorID(); - doReturn(0).when(taskInfo).getAttemptNumber(); - doReturn(taskInfo).when(streamingRuntimeContext).getTaskInfo(); - doReturn(streamingRuntimeContext).when(icebergWriteAggregator).getRuntimeContext(); - - try { - icebergWriteAggregator.open(); - } catch (Exception e) { - throw new RuntimeException(e); - } - return icebergWriteAggregator; - } - - private CommittableSummary processElement( - WriteResult withRecord, - String myJobId, - long checkpointId, - OneInputStreamOperatorTestHarness testHarness, - int subTaskId, - String operatorId) - throws Exception { - - IcebergCommittable commit = - new IcebergCommittable( - buildIcebergWriteAggregator(myJobId, operatorId) - .writeToManifest(Lists.newArrayList(withRecord), checkpointId), - myJobId, - operatorId, - checkpointId); - - CommittableSummary committableSummary = - new CommittableSummary<>(subTaskId, 1, checkpointId, 1, 1, 0); - testHarness.processElement(new StreamRecord<>(committableSummary)); - - CommittableWithLineage committable = - new CommittableWithLineage<>(commit, checkpointId, subTaskId); - testHarness.processElement(new 
StreamRecord<>(committable)); - - return committableSummary; - } - - private CommittableSummary processElement( - String myJobID, - long checkpointId, - OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - testHarness, - int subTaskId, - String operatorId, - DataFile... dataFile) - throws Exception { - WriteResult withRecord = WriteResult.builder().addDataFiles(dataFile).build(); - return processElement(withRecord, myJobID, checkpointId, testHarness, subTaskId, operatorId); - } - - private FileAppenderFactory createDeletableAppenderFactory() { - int[] equalityFieldIds = - new int[] { - table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() - }; - return new FlinkAppenderFactory( - table, - table.schema(), - FlinkSchemaUtil.convert(table.schema()), - table.properties(), - table.spec(), - equalityFieldIds, - table.schema(), - null); - } - - private List assertFlinkManifests(int expectedCount) throws IOException { - List manifests = - Files.list(flinkManifestFolder.toPath()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - assertThat(manifests).hasSize(expectedCount); - return manifests; - } - - private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile( - table, - table.schema(), - table.spec(), - new Configuration(), - table.location(), - FileFormat.PARQUET.addExtension(filename), - rows); - } - - private DeleteFile writeEqDeleteFile( - FileAppenderFactory appenderFactory, String filename, List deletes) - throws IOException { - return SimpleDataUtil.writeEqDeleteFile( - table, FileFormat.PARQUET, filename, appenderFactory, deletes); - } - - private OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - getTestHarness() throws Exception { - IcebergSink sink = - IcebergSink.forRowData(null).table(table).toBranch(branch).tableLoader(tableLoader).build(); - - OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - testHarness = - new OneInputStreamOperatorTestHarness<>( - new CommitterOperatorFactory<>(sink, !isStreamingMode, true)); - testHarness.setup(committableMessageTypeSerializer); - return testHarness; - } - - // ------------------------------- Utility Methods -------------------------------- - - private IcebergCommitter getCommitter() { - IcebergFilesCommitterMetrics metric = mock(IcebergFilesCommitterMetrics.class); - return new IcebergCommitter( - tableLoader, - branch, - Collections.singletonMap("flink.test", TestIcebergCommitter.class.getName()), - false, - 10, - "sinkId", - metric, - false); - } - - private Committer.CommitRequest buildCommitRequestFor( - String myJobID, long checkpoint, Collection writeResults) throws IOException { - IcebergCommittable commit = - new IcebergCommittable( - buildIcebergWriteAggregator(myJobID, OPERATOR_ID) - .writeToManifest(writeResults, checkpoint), - myJobID, - OPERATOR_ID, - checkpoint); - - CommittableWithLineage committableWithLineage = - new CommittableWithLineage(commit, checkpoint, 1); - Committer.CommitRequest commitRequest = mock(Committer.CommitRequest.class); - - doReturn(committableWithLineage.getCommittable()).when(commitRequest).getCommittable(); - - return commitRequest; - } - - private WriteResult of(DataFile dataFile) { - return WriteResult.builder().addDataFiles(dataFile).build(); - } - - private void assertMaxCommittedCheckpointId(String myJobID, String operatorId, long expectedId) { - table.refresh(); - long actualId = 
SinkUtil.getMaxCommittedCheckpointId(table, myJobID, operatorId, branch); - assertThat(actualId).isEqualTo(expectedId); - } - - private void assertMaxCommittedCheckpointId(String myJobID, long expectedId) { - assertMaxCommittedCheckpointId(myJobID, OPERATOR_ID, expectedId); - } - - private void assertSnapshotSize(int expectedSnapshotSize) { - table.refresh(); - assertThat(table.snapshots()).hasSize(expectedSnapshotSize); - } - - private static ByteBuffer longToBuffer(long value) { - return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); - } - - private static class TestCommittableMessageTypeSerializer - extends TypeSerializer> { - - CommittableMessageSerializer serializer = - new CommittableMessageSerializer<>(new IcebergCommittableSerializer()); - - @Override - public boolean isImmutableType() { - return false; - } - - @Override - public TypeSerializer> duplicate() { - return null; - } - - @Override - public CommittableMessage createInstance() { - return null; - } - - @Override - public CommittableMessage copy( - CommittableMessage from) { - return from; - } - - @Override - public CommittableMessage copy( - CommittableMessage from, CommittableMessage reuse) { - return from; - } - - @Override - public int getLength() { - return 0; - } - - @Override - public void serialize(CommittableMessage record, DataOutputView target) - throws IOException { - byte[] serialize = serializer.serialize(record); - target.writeInt(serialize.length); - target.write(serialize); - } - - @Override - public CommittableMessage deserialize(DataInputView source) - throws IOException { - int length = source.readInt(); - byte[] bytes = new byte[length]; - source.read(bytes); - return serializer.deserialize(1, bytes); - } - - @Override - public CommittableMessage deserialize( - CommittableMessage reuse, DataInputView source) throws IOException { - return deserialize(source); - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws IOException { - CommittableMessage deserialize = deserialize(source); - serialize(deserialize, target); - } - - @Override - public boolean equals(Object obj) { - return false; - } - - @Override - public int hashCode() { - return 0; - } - - @Override - public TypeSerializerSnapshot> snapshotConfiguration() { - return null; - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java deleted file mode 100644 index bc6e00b03d9b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java +++ /dev/null @@ -1,1238 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.flink.sink.IcebergFilesCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.NavigableMap; -import java.util.SortedMap; -import java.util.stream.Collectors; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.JobID; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.OperatorStateStore; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.runtime.operators.testutils.MockEnvironment; -import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder; -import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; -import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.StreamOperator; -import org.apache.flink.streaming.api.operators.StreamOperatorParameters; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionData; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.TestBase; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.TestTableLoader; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.Pair; -import org.apache.iceberg.util.ThreadPools; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergFilesCommitter extends TestBase { - private static final Configuration CONF = new Configuration(); - - private File flinkManifestFolder; - - @Parameter(index = 1) - private FileFormat format; - - @Parameter(index = 2) - private String branch; - - @Parameters(name = "formatVersion = {0}, fileFormat = {1}, branch = {2}") - protected static List 
parameters() { - return Arrays.asList( - new Object[] {1, FileFormat.AVRO, "main"}, - new Object[] {2, FileFormat.AVRO, "test-branch"}, - new Object[] {1, FileFormat.PARQUET, "main"}, - new Object[] {2, FileFormat.PARQUET, "test-branch"}, - new Object[] {1, FileFormat.ORC, "main"}, - new Object[] {2, FileFormat.ORC, "test-branch"}); - } - - @Override - @BeforeEach - public void setupTable() throws IOException { - flinkManifestFolder = Files.createTempDirectory(temp, "flink").toFile(); - this.metadataDir = new File(tableDir, "metadata"); - - // Construct the iceberg table. - table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); - - table - .updateProperties() - .set(DEFAULT_FILE_FORMAT, format.name()) - .set(FLINK_MANIFEST_LOCATION, flinkManifestFolder.getAbsolutePath()) - .set(MAX_CONTINUOUS_EMPTY_COMMITS, "1") - .commit(); - } - - @TestTemplate - public void testCommitTxnWithoutDataFiles() throws Exception { - long checkpointId = 0; - long timestamp = 0; - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - SimpleDataUtil.assertTableRows(table, Lists.newArrayList(), branch); - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the - // future flink job failover won't fail. - for (int i = 1; i <= 3; i++) { - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(0); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - } - } - } - - @TestTemplate - public void testMaxContinuousEmptyCommits() throws Exception { - table.updateProperties().set(MAX_CONTINUOUS_EMPTY_COMMITS, "3").commit(); - - JobID jobId = new JobID(); - long checkpointId = 0; - long timestamp = 0; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertSnapshotSize(0); - - for (int i = 1; i <= 9; i++) { - harness.snapshot(++checkpointId, ++timestamp); - harness.notifyOfCompletedCheckpoint(checkpointId); - - assertSnapshotSize(i / 3); - } - } - } - - private FlinkWriteResult of(long checkpointId, DataFile dataFile) { - return new FlinkWriteResult(checkpointId, WriteResult.builder().addDataFiles(dataFile).build()); - } - - @TestTemplate - public void testCommitTxn() throws Exception { - // Test with 3 continues checkpoints: - // 1. snapshotState for checkpoint#1 - // 2. notifyCheckpointComplete for checkpoint#1 - // 3. snapshotState for checkpoint#2 - // 4. notifyCheckpointComplete for checkpoint#2 - // 5. snapshotState for checkpoint#3 - // 6. 
notifyCheckpointComplete for checkpoint#3 - long timestamp = 0; - - JobID jobID = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobID)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - - List rows = Lists.newArrayListWithExpectedSize(3); - for (int i = 1; i <= 3; i++) { - RowData rowData = SimpleDataUtil.createRowData(i, "hello" + i); - DataFile dataFile = writeDataFile("data-" + i, ImmutableList.of(rowData)); - harness.processElement(of(i, dataFile), ++timestamp); - rows.add(rowData); - - harness.snapshot(i, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(i); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows), branch); - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(jobID, operatorId, i); - assertThat(SimpleDataUtil.latestSnapshot(table, branch).summary()) - .containsEntry("flink.test", TestIcebergFilesCommitter.class.getName()); - } - } - } - - @TestTemplate - public void testOrderedEventsBetweenCheckpoints() throws Exception { - // It's possible that two checkpoints happen in the following orders: - // 1. snapshotState for checkpoint#1; - // 2. snapshotState for checkpoint#2; - // 3. notifyCheckpointComplete for checkpoint#1; - // 4. notifyCheckpointComplete for checkpoint#2; - long timestamp = 0; - - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - long firstCheckpointId = 1; - harness.processElement(of(firstCheckpointId, dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 1. snapshotState for checkpoint#1 - harness.snapshot(firstCheckpointId, ++timestamp); - assertFlinkManifests(1); - - RowData row2 = SimpleDataUtil.createRowData(2, "world"); - DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); - long secondCheckpointId = 2; - harness.processElement(of(secondCheckpointId, dataFile2), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 2. snapshotState for checkpoint#2 - harness.snapshot(secondCheckpointId, ++timestamp); - assertFlinkManifests(2); - - // 3. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(firstCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, firstCheckpointId); - assertFlinkManifests(1); - - // 4. notifyCheckpointComplete for checkpoint#2 - harness.notifyOfCompletedCheckpoint(secondCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); - assertFlinkManifests(0); - } - } - - @TestTemplate - public void testDisorderedEventsBetweenCheckpoints() throws Exception { - // It's possible that the two checkpoints happen in the following orders: - // 1. snapshotState for checkpoint#1; - // 2. snapshotState for checkpoint#2; - // 3. notifyCheckpointComplete for checkpoint#2; - // 4. 
notifyCheckpointComplete for checkpoint#1; - long timestamp = 0; - - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - long firstCheckpointId = 1; - harness.processElement(of(firstCheckpointId, dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 1. snapshotState for checkpoint#1 - harness.snapshot(firstCheckpointId, ++timestamp); - assertFlinkManifests(1); - - RowData row2 = SimpleDataUtil.createRowData(2, "world"); - DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); - long secondCheckpointId = 2; - harness.processElement(of(secondCheckpointId, dataFile2), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 2. snapshotState for checkpoint#2 - harness.snapshot(secondCheckpointId, ++timestamp); - assertFlinkManifests(2); - - // 3. notifyCheckpointComplete for checkpoint#2 - harness.notifyOfCompletedCheckpoint(secondCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); - assertFlinkManifests(0); - - // 4. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(firstCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); - assertFlinkManifests(0); - } - } - - @TestTemplate - public void testRecoveryFromValidSnapshot() throws Exception { - long checkpointId = 0; - long timestamp = 0; - List expectedRows = Lists.newArrayList(); - OperatorSubtaskState snapshot; - - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row = SimpleDataUtil.createRowData(1, "hello"); - expectedRows.add(row); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row)); - - harness.processElement(of(++checkpointId, dataFile1), ++timestamp); - snapshot = harness.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row), branch); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - } - - // Restore from the given snapshot - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.getStreamConfig().setOperatorID(operatorId); - harness.setup(); - harness.initializeState(snapshot); - harness.open(); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - - RowData row = SimpleDataUtil.createRowData(2, "world"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); - harness.processElement(of(++checkpointId, dataFile), ++timestamp); - - harness.snapshot(checkpointId, ++timestamp); - 
assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - } - } - - @TestTemplate - public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception { - // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's - // possible that we - // flink job will restore from a checkpoint with only step#1 finished. - long checkpointId = 0; - long timestamp = 0; - OperatorSubtaskState snapshot; - List expectedRows = Lists.newArrayList(); - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row = SimpleDataUtil.createRowData(1, "hello"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-1", ImmutableList.of(row)); - harness.processElement(of(++checkpointId, dataFile), ++timestamp); - - snapshot = harness.snapshot(checkpointId, ++timestamp); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - assertFlinkManifests(1); - } - - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.getStreamConfig().setOperatorID(operatorId); - harness.setup(); - harness.initializeState(snapshot); - harness.open(); - - // All flink manifests should be cleaned because it has committed the unfinished iceberg - // transaction. - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - - harness.snapshot(++checkpointId, ++timestamp); - // Did not write any new record, so it won't generate new manifest. - assertFlinkManifests(0); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - - RowData row = SimpleDataUtil.createRowData(2, "world"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); - harness.processElement(of(++checkpointId, dataFile), ++timestamp); - - snapshot = harness.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(1); - } - - // Redeploying flink job from external checkpoint. - JobID newJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(newJobId)) { - harness.setup(); - harness.initializeState(snapshot); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - // All flink manifests should be cleaned because it has committed the unfinished iceberg - // transaction. 
- assertFlinkManifests(0); - - assertMaxCommittedCheckpointId(newJobId, operatorId, -1); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(3); - - RowData row = SimpleDataUtil.createRowData(3, "foo"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-3", ImmutableList.of(row)); - harness.processElement(of(++checkpointId, dataFile), ++timestamp); - - harness.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(4); - assertMaxCommittedCheckpointId(newJobId, operatorId, checkpointId); - } - } - - @TestTemplate - public void testStartAnotherJobToWriteSameTable() throws Exception { - long checkpointId = 0; - long timestamp = 0; - List rows = Lists.newArrayList(); - List tableRows = Lists.newArrayList(); - - JobID oldJobId = new JobID(); - OperatorID oldOperatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(oldJobId)) { - harness.setup(); - harness.open(); - oldOperatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, -1L); - - for (int i = 1; i <= 3; i++) { - rows.add(SimpleDataUtil.createRowData(i, "hello" + i)); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); - harness.processElement(of(++checkpointId, dataFile), ++timestamp); - harness.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, tableRows, branch); - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, checkpointId); - } - } - - // The new started job will start with checkpoint = 1 again. 
- checkpointId = 0; - timestamp = 0; - JobID newJobId = new JobID(); - OperatorID newOperatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(newJobId)) { - harness.setup(); - harness.open(); - newOperatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(3); - assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, 3); - assertMaxCommittedCheckpointId(newJobId, newOperatorId, -1); - - rows.add(SimpleDataUtil.createRowData(2, "world")); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile("data-new-1", rows); - harness.processElement(of(++checkpointId, dataFile), ++timestamp); - harness.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, tableRows, branch); - assertSnapshotSize(4); - assertMaxCommittedCheckpointId(newJobId, newOperatorId, checkpointId); - } - } - - @TestTemplate - public void testMultipleJobsWriteSameTable() throws Exception { - long timestamp = 0; - List tableRows = Lists.newArrayList(); - - JobID[] jobs = new JobID[] {new JobID(), new JobID(), new JobID()}; - OperatorID[] operatorIds = - new OperatorID[] {new OperatorID(), new OperatorID(), new OperatorID()}; - for (int i = 0; i < 20; i++) { - int jobIndex = i % 3; - int checkpointId = i / 3; - JobID jobId = jobs[jobIndex]; - OperatorID operatorId = operatorIds[jobIndex]; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.getStreamConfig().setOperatorID(operatorId); - harness.setup(); - harness.open(); - - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId == 0 ? -1 : checkpointId); - - List rows = Lists.newArrayList(SimpleDataUtil.createRowData(i, "word-" + i)); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); - harness.processElement(of(checkpointId + 1, dataFile), ++timestamp); - harness.snapshot(checkpointId + 1, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId + 1); - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, tableRows, branch); - assertSnapshotSize(i + 1); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId + 1); - } - } - } - - @TestTemplate - public void testMultipleSinksRecoveryFromValidSnapshot() throws Exception { - long checkpointId = 0; - long timestamp = 0; - List expectedRows = Lists.newArrayList(); - OperatorSubtaskState snapshot1; - OperatorSubtaskState snapshot2; - - JobID jobId = new JobID(); - OperatorID operatorId1 = new OperatorID(); - OperatorID operatorId2 = new OperatorID(); - try (OneInputStreamOperatorTestHarness harness1 = - createStreamSink(jobId); - OneInputStreamOperatorTestHarness harness2 = - createStreamSink(jobId)) { - harness1.getStreamConfig().setOperatorID(operatorId1); - harness1.setup(); - harness1.open(); - harness2.getStreamConfig().setOperatorID(operatorId2); - harness2.setup(); - harness2.open(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, operatorId1, -1L); - assertMaxCommittedCheckpointId(jobId, operatorId2, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello1"); - expectedRows.add(row1); - DataFile dataFile1 = writeDataFile("data-1-1", ImmutableList.of(row1)); - - harness1.processElement(of(++checkpointId, dataFile1), ++timestamp); - snapshot1 = harness1.snapshot(checkpointId, ++timestamp); - - RowData row2 = SimpleDataUtil.createRowData(1, "hello2"); - 
expectedRows.add(row2); - DataFile dataFile2 = writeDataFile("data-1-2", ImmutableList.of(row2)); - - harness2.processElement(of(checkpointId, dataFile2), ++timestamp); - snapshot2 = harness2.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(2); - - // Only notify one of the committers - harness1.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(1); - - // Only the first row is committed at this point - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId2, -1); - } - - // Restore from the given snapshot - try (OneInputStreamOperatorTestHarness harness1 = - createStreamSink(jobId); - OneInputStreamOperatorTestHarness harness2 = - createStreamSink(jobId)) { - harness1.getStreamConfig().setOperatorID(operatorId1); - harness1.setup(); - harness1.initializeState(snapshot1); - harness1.open(); - - harness2.getStreamConfig().setOperatorID(operatorId2); - harness2.setup(); - harness2.initializeState(snapshot2); - harness2.open(); - - // All flink manifests should be cleaned because it has committed the unfinished iceberg - // transaction. - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId2, checkpointId); - - RowData row1 = SimpleDataUtil.createRowData(2, "world1"); - expectedRows.add(row1); - DataFile dataFile1 = writeDataFile("data-2-1", ImmutableList.of(row1)); - - harness1.processElement(of(++checkpointId, dataFile1), ++timestamp); - harness1.snapshot(checkpointId, ++timestamp); - - RowData row2 = SimpleDataUtil.createRowData(2, "world2"); - expectedRows.add(row2); - DataFile dataFile2 = writeDataFile("data-2-2", ImmutableList.of(row2)); - harness2.processElement(of(checkpointId, dataFile2), ++timestamp); - harness2.snapshot(checkpointId, ++timestamp); - - assertFlinkManifests(2); - - harness1.notifyOfCompletedCheckpoint(checkpointId); - harness2.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(4); - assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId2, checkpointId); - } - } - - @TestTemplate - public void testBoundedStream() throws Exception { - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertFlinkManifests(0); - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - List tableRows = Lists.newArrayList(SimpleDataUtil.createRowData(1, "word-1")); - - DataFile dataFile = writeDataFile("data-1", tableRows); - harness.processElement(of(IcebergStreamWriter.END_INPUT_CHECKPOINT_ID, dataFile), 1); - ((BoundedOneInput) harness.getOneInputOperator()).endInput(); - - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, tableRows, branch); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId( - jobId, operatorId, IcebergStreamWriter.END_INPUT_CHECKPOINT_ID); - assertThat(SimpleDataUtil.latestSnapshot(table, branch).summary()) - .containsEntry("flink.test", TestIcebergFilesCommitter.class.getName()); - } - } - - @TestTemplate - public void 
testFlinkManifests() throws Exception { - long timestamp = 0; - final long checkpoint = 10; - - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - harness.processElement(of(checkpoint, dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 1. snapshotState for checkpoint#1 - harness.snapshot(checkpoint, ++timestamp); - List manifestPaths = assertFlinkManifests(1); - Path manifestPath = manifestPaths.get(0); - assertThat(manifestPath.getFileName()) - .asString() - .isEqualTo( - String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1)); - - // 2. Read the data files from manifests and assert. - List dataFiles = - FlinkManifestUtil.readDataFiles( - createTestingManifestFile(manifestPath, dataFile1), table.io(), table.specs()); - assertThat(dataFiles).hasSize(1); - TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); - - // 3. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); - assertFlinkManifests(0); - } - } - - @TestTemplate - public void testDeleteFiles() throws Exception { - assumeThat(formatVersion) - .as("Only support equality-delete in format v2 or later.") - .isGreaterThan(1); - - long timestamp = 0; - long checkpoint = 10; - - JobID jobId = new JobID(); - OperatorID operatorId; - FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); - - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row1 = SimpleDataUtil.createInsert(1, "aaa"); - DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(row1)); - harness.processElement(of(checkpoint, dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 1. snapshotState for checkpoint#1 - harness.snapshot(checkpoint, ++timestamp); - List manifestPaths = assertFlinkManifests(1); - Path manifestPath = manifestPaths.get(0); - assertThat(manifestPath.getFileName()) - .asString() - .isEqualTo( - String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1)); - - // 2. Read the data files from manifests and assert. - List dataFiles = - FlinkManifestUtil.readDataFiles( - createTestingManifestFile(manifestPath, dataFile1), table.io(), table.specs()); - assertThat(dataFiles).hasSize(1); - TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); - - // 3. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); - assertFlinkManifests(0); - - // 4. process both data files and delete files. 
- RowData row2 = SimpleDataUtil.createInsert(2, "bbb"); - DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(row2)); - - RowData delete1 = SimpleDataUtil.createDelete(1, "aaa"); - DeleteFile deleteFile1 = - writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); - harness.processElement( - new FlinkWriteResult( - ++checkpoint, - WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile1).build()), - ++timestamp); - - // 5. snapshotState for checkpoint#2 - harness.snapshot(checkpoint, ++timestamp); - assertFlinkManifests(2); - - // 6. notifyCheckpointComplete for checkpoint#2 - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row2), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); - assertFlinkManifests(0); - } - } - - @TestTemplate - public void testCommitTwoCheckpointsInSingleTxn() throws Exception { - assumeThat(formatVersion) - .as("Only support equality-delete in format v2 or later.") - .isGreaterThan(1); - - long timestamp = 0; - long checkpoint = 10; - - JobID jobId = new JobID(); - OperatorID operatorId; - FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); - - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData insert1 = SimpleDataUtil.createInsert(1, "aaa"); - RowData insert2 = SimpleDataUtil.createInsert(2, "bbb"); - RowData delete3 = SimpleDataUtil.createDelete(3, "ccc"); - DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(insert1, insert2)); - DeleteFile deleteFile1 = - writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3)); - harness.processElement( - new FlinkWriteResult( - checkpoint, - WriteResult.builder().addDataFiles(dataFile1).addDeleteFiles(deleteFile1).build()), - ++timestamp); - - // The 1th snapshotState. - harness.snapshot(checkpoint, ++timestamp); - - RowData insert4 = SimpleDataUtil.createInsert(4, "ddd"); - RowData delete2 = SimpleDataUtil.createDelete(2, "bbb"); - DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(insert4)); - DeleteFile deleteFile2 = - writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2)); - harness.processElement( - new FlinkWriteResult( - ++checkpoint, - WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile2).build()), - ++timestamp); - - // The 2nd snapshotState. - harness.snapshot(checkpoint, ++timestamp); - - // Notify the 2nd snapshot to complete. - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert4), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); - assertFlinkManifests(0); - assertThat(table.snapshots()).hasSize(2); - } - } - - /** - * The testcase is to simulate upserting to an Iceberg V2 table, and facing the following - * scenario: - * - *
- * <ul>
- *   <li>A specific row is updated
- *   <li>The prepareSnapshotPreBarrier triggered
- *   <li>Checkpoint failed for reasons outside of the Iceberg connector
- *   <li>The specific row is updated again in the second checkpoint as well
- *   <li>Second snapshot is triggered, and finished
- * </ul>
- *
- * <p>
    Previously the files from the 2 snapshots were committed in a single Iceberg commit, as a - * results duplicate rows were created in the table. - * - * @throws Exception Exception - */ - @TestTemplate - public void testCommitMultipleCheckpointsForV2Table() throws Exception { - assumeThat(formatVersion) - .as("Only support equality-delete in format v2 or later.") - .isGreaterThan(1); - - long timestamp = 0; - long checkpoint = 10; - - JobID jobId = new JobID(); - OperatorID operatorId; - - FileAppenderFactory appenderFactory = - new FlinkAppenderFactory( - table, - table.schema(), - FlinkSchemaUtil.convert(table.schema()), - table.properties(), - table.spec(), - new int[] {table.schema().findField("id").fieldId()}, - table.schema(), - null); - - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData insert1 = null; - RowData insert2 = null; - for (int i = 1; i <= 3; i++) { - insert1 = SimpleDataUtil.createInsert(1, "aaa" + i); - insert2 = SimpleDataUtil.createInsert(2, "bbb" + i); - DataFile dataFile = writeDataFile("data-file-" + i, ImmutableList.of(insert1, insert2)); - DeleteFile deleteFile = - writeEqDeleteFile( - appenderFactory, "delete-file-" + i, ImmutableList.of(insert1, insert2)); - harness.processElement( - new FlinkWriteResult( - ++checkpoint, - WriteResult.builder().addDataFiles(dataFile).addDeleteFiles(deleteFile).build()), - ++timestamp); - } - - harness.snapshot(checkpoint, ++timestamp); - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert2), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); - assertFlinkManifests(0); - assertThat(table.snapshots()).hasSize(3); - } - } - - @TestTemplate - public void testSpecEvolution() throws Exception { - long timestamp = 0; - int checkpointId = 0; - List rows = Lists.newArrayList(); - JobID jobId = new JobID(); - - OperatorID operatorId; - OperatorSubtaskState snapshot; - DataFile dataFile; - int specId; - - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - - checkpointId++; - RowData rowData = SimpleDataUtil.createRowData(checkpointId, "hello" + checkpointId); - // table unpartitioned - dataFile = writeDataFile("data-" + checkpointId, ImmutableList.of(rowData)); - harness.processElement(of(checkpointId, dataFile), ++timestamp); - rows.add(rowData); - harness.snapshot(checkpointId, ++timestamp); - - specId = - getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); - assertThat(specId).isEqualTo(table.spec().specId()); - - harness.notifyOfCompletedCheckpoint(checkpointId); - - // Change partition spec - table.refresh(); - PartitionSpec oldSpec = table.spec(); - table.updateSpec().addField("id").commit(); - - checkpointId++; - rowData = SimpleDataUtil.createRowData(checkpointId, "hello" + checkpointId); - // write data with old partition spec - dataFile = writeDataFile("data-" + checkpointId, ImmutableList.of(rowData), oldSpec, null); - harness.processElement(of(checkpointId, dataFile), ++timestamp); - rows.add(rowData); - snapshot = harness.snapshot(checkpointId, ++timestamp); - - specId = - getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); - 
assertThat(specId).isEqualTo(oldSpec.specId()); - - harness.notifyOfCompletedCheckpoint(checkpointId); - - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows), branch); - assertSnapshotSize(checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - } - - // Restore from the given snapshot - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(jobId)) { - harness.getStreamConfig().setOperatorID(operatorId); - harness.setup(); - harness.initializeState(snapshot); - harness.open(); - - SimpleDataUtil.assertTableRows(table, rows, branch); - assertSnapshotSize(checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - - checkpointId++; - RowData row = SimpleDataUtil.createRowData(checkpointId, "world" + checkpointId); - StructLike partition = new PartitionData(table.spec().partitionType()); - partition.set(0, checkpointId); - dataFile = - writeDataFile("data-" + checkpointId, ImmutableList.of(row), table.spec(), partition); - harness.processElement(of(checkpointId, dataFile), ++timestamp); - rows.add(row); - harness.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(1); - - specId = - getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); - assertThat(specId).isEqualTo(table.spec().specId()); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, rows, branch); - assertSnapshotSize(checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - } - } - - private int getStagingManifestSpecId(OperatorStateStore operatorStateStore, long checkPointId) - throws Exception { - ListState> checkpointsState = - operatorStateStore.getListState(IcebergFilesCommitter.buildStateDescriptor()); - NavigableMap statedDataFiles = - Maps.newTreeMap(checkpointsState.get().iterator().next()); - DeltaManifests deltaManifests = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, statedDataFiles.get(checkPointId)); - return deltaManifests.dataManifest().partitionSpecId(); - } - - private DeleteFile writeEqDeleteFile( - FileAppenderFactory appenderFactory, String filename, List deletes) - throws IOException { - return SimpleDataUtil.writeEqDeleteFile(table, format, filename, appenderFactory, deletes); - } - - private DeleteFile writePosDeleteFile( - FileAppenderFactory appenderFactory, - String filename, - List> positions) - throws IOException { - return SimpleDataUtil.writePosDeleteFile(table, format, filename, appenderFactory, positions); - } - - private FileAppenderFactory createDeletableAppenderFactory() { - int[] equalityFieldIds = - new int[] { - table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() - }; - return new FlinkAppenderFactory( - table, - table.schema(), - FlinkSchemaUtil.convert(table.schema()), - table.properties(), - table.spec(), - equalityFieldIds, - table.schema(), - null); - } - - private ManifestFile createTestingManifestFile(Path manifestPath, DataFile dataFile) - throws IOException { - ManifestWriter writer = - ManifestFiles.write( - formatVersion, - PartitionSpec.unpartitioned(), - table.io().newOutputFile(manifestPath.toString()), - 0L); - writer.add(dataFile); - writer.close(); - return writer.toManifestFile(); - } - - private List assertFlinkManifests(int expectedCount) throws IOException { - List manifests = - Files.list(flinkManifestFolder.toPath()) - .filter(p -> 
!p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - assertThat(manifests).hasSize(expectedCount); - return manifests; - } - - private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile( - table, - table.schema(), - table.spec(), - CONF, - table.location(), - format.addExtension(filename), - rows); - } - - private DataFile writeDataFile( - String filename, List rows, PartitionSpec spec, StructLike partition) - throws IOException { - return SimpleDataUtil.writeFile( - table, - table.schema(), - spec, - CONF, - table.location(), - format.addExtension(filename), - rows, - partition); - } - - private void assertMaxCommittedCheckpointId(JobID jobID, OperatorID operatorID, long expectedId) { - table.refresh(); - long actualId = - SinkUtil.getMaxCommittedCheckpointId( - table, jobID.toString(), operatorID.toString(), branch); - assertThat(actualId).isEqualTo(expectedId); - } - - private void assertSnapshotSize(int expectedSnapshotSize) { - table.refresh(); - assertThat(table.snapshots()).hasSize(expectedSnapshotSize); - } - - private OneInputStreamOperatorTestHarness createStreamSink(JobID jobID) - throws Exception { - TestOperatorFactory factory = TestOperatorFactory.of(table.location(), branch, table.spec()); - return new OneInputStreamOperatorTestHarness<>(factory, createEnvironment(jobID)); - } - - private static MockEnvironment createEnvironment(JobID jobID) { - return new MockEnvironmentBuilder() - .setTaskName("test task") - .setManagedMemorySize(32 * 1024) - .setInputSplitProvider(new MockInputSplitProvider()) - .setBufferSize(256) - .setTaskConfiguration(new org.apache.flink.configuration.Configuration()) - .setExecutionConfig(new ExecutionConfig()) - .setMaxParallelism(16) - .setJobID(jobID) - .build(); - } - - private static class TestOperatorFactory extends AbstractStreamOperatorFactory - implements OneInputStreamOperatorFactory { - private final String tablePath; - private final String branch; - private final PartitionSpec spec; - - private TestOperatorFactory(String tablePath, String branch, PartitionSpec spec) { - this.tablePath = tablePath; - this.branch = branch; - this.spec = spec; - } - - private static TestOperatorFactory of(String tablePath, String branch, PartitionSpec spec) { - return new TestOperatorFactory(tablePath, branch, spec); - } - - @Override - @SuppressWarnings("unchecked") - public > T createStreamOperator( - StreamOperatorParameters param) { - IcebergFilesCommitter committer = - new IcebergFilesCommitter( - new TestTableLoader(tablePath), - false, - Collections.singletonMap("flink.test", TestIcebergFilesCommitter.class.getName()), - ThreadPools.WORKER_THREAD_POOL_SIZE, - branch, - spec); - committer.setup(param.getContainingTask(), param.getStreamConfig(), param.getOutput()); - return (T) committer; - } - - @Override - public Class getStreamOperatorClass(ClassLoader classLoader) { - return IcebergFilesCommitter.class; - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java deleted file mode 100644 index bf7f7b5e9815..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSink.java +++ /dev/null @@ -1,563 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.dag.Transformation; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.sink.IcebergSink.Builder; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergSink extends TestFlinkIcebergSinkBase { - - private TableLoader tableLoader; - - @Parameter(index = 0) - private FileFormat format; - - @Parameter(index = 1) - private int parallelism; - - @Parameter(index = 2) - private boolean partitioned; - - @Parameter(index = 3) - private boolean isTableSchema; - - @Parameters(name = "format={0}, parallelism={1}, partitioned={2}, isTableSchema={3}") - public static Object[][] parameters() { - return new Object[][] { - // Remove after the deprecation of TableSchema - BEGIN - {FileFormat.AVRO, 1, true, true}, - {FileFormat.AVRO, 1, false, true}, - {FileFormat.AVRO, 2, true, true}, - {FileFormat.AVRO, 2, false, true}, - {FileFormat.ORC, 1, true, true}, - 
{FileFormat.ORC, 1, false, true}, - {FileFormat.ORC, 2, true, true}, - {FileFormat.ORC, 2, false, true}, - {FileFormat.PARQUET, 1, true, true}, - {FileFormat.PARQUET, 1, false, true}, - {FileFormat.PARQUET, 2, true, true}, - {FileFormat.PARQUET, 2, false, true}, - // Remove after the deprecation of TableSchema - END - - {FileFormat.AVRO, 1, true, false}, - {FileFormat.AVRO, 1, false, false}, - {FileFormat.AVRO, 2, true, false}, - {FileFormat.AVRO, 2, false, false}, - {FileFormat.ORC, 1, true, false}, - {FileFormat.ORC, 1, false, false}, - {FileFormat.ORC, 2, true, false}, - {FileFormat.ORC, 2, false, false}, - {FileFormat.PARQUET, 1, true, false}, - {FileFormat.PARQUET, 1, false, false}, - {FileFormat.PARQUET, 2, true, false}, - {FileFormat.PARQUET, 2, false, false}, - }; - } - - @BeforeEach - void before() throws IOException { - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - partitioned - ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - void testWriteRowData() throws Exception { - List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); - - IcebergSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream"); - - // Assert the iceberg table's records. 
- SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); - } - - @TestTemplate - void testWriteRow() throws Exception { - testWriteRow(null, DistributionMode.NONE); - } - - @TestTemplate - void testWriteRowWithTableSchema() throws Exception { - testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); - } - - @TestTemplate - void testPartitionWriteMode() throws Exception { - testWriteRow(null, DistributionMode.HASH); - if (partitioned) { - assertThat(partitionFiles("aaa")) - .as("There should be only 1 data file in partition 'aaa'") - .isEqualTo(1); - assertThat(partitionFiles("bbb")) - .as("There should be only 1 data file in partition 'bbb'") - .isEqualTo(1); - assertThat(partitionFiles("ccc")) - .as("There should be only 1 data file in partition 'ccc'") - .isEqualTo(1); - } - } - - @TestTemplate - void testShuffleByPartitionWithSchema() throws Exception { - testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH); - if (partitioned) { - assertThat(partitionFiles("aaa")) - .as("There should be only 1 data file in partition 'aaa'") - .isEqualTo(1); - assertThat(partitionFiles("bbb")) - .as("There should be only 1 data file in partition 'bbb'") - .isEqualTo(1); - assertThat(partitionFiles("ccc")) - .as("There should be only 1 data file in partition 'ccc'") - .isEqualTo(1); - } - } - - @TestTemplate - void testTwoSinksInDisjointedDAG() throws Exception { - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - - Table leftTable = - CATALOG_EXTENSION - .catalog() - .createTable( - TableIdentifier.of("left"), - SimpleDataUtil.SCHEMA, - partitioned - ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - props); - TableLoader leftTableLoader = - TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("left")); - - Table rightTable = - CATALOG_EXTENSION - .catalog() - .createTable( - TableIdentifier.of("right"), - SimpleDataUtil.SCHEMA, - partitioned - ? 
PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - props); - TableLoader rightTableLoader = - TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("right")); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - env.getConfig().disableAutoGeneratedUIDs(); - - List leftRows = createRows("left-"); - DataStream leftStream = - env.addSource(createBoundedSource(leftRows), ROW_TYPE_INFO) - .name("leftCustomSource") - .uid("leftCustomSource"); - - if (isTableSchema) { - IcebergSink.forRow(leftStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(leftTable) - .tableLoader(leftTableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .distributionMode(DistributionMode.NONE) - .uidSuffix("leftIcebergSink") - .append(); - } else { - IcebergSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) - .table(leftTable) - .tableLoader(leftTableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .distributionMode(DistributionMode.NONE) - .uidSuffix("leftIcebergSink") - .append(); - } - - List rightRows = createRows("right-"); - DataStream rightStream = - env.addSource(createBoundedSource(rightRows), ROW_TYPE_INFO) - .name("rightCustomSource") - .uid("rightCustomSource"); - - if (isTableSchema) { - IcebergSink.forRow(rightStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(rightTable) - .tableLoader(rightTableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.HASH) - .uidSuffix("rightIcebergSink") - .setSnapshotProperty("flink.test", TestIcebergSink.class.getName()) - .snapshotProperties(Collections.singletonMap("direction", "rightTable")) - .append(); - } else { - IcebergSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) - .table(rightTable) - .tableLoader(rightTableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.HASH) - .uidSuffix("rightIcebergSink") - .setSnapshotProperty("flink.test", TestIcebergSink.class.getName()) - .snapshotProperties(Collections.singletonMap("direction", "rightTable")) - .append(); - } - - // Execute the program. - env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(leftTable, convertToRowData(leftRows)); - SimpleDataUtil.assertTableRows(rightTable, convertToRowData(rightRows)); - - leftTable.refresh(); - - assertThat(leftTable.currentSnapshot().summary().get("flink.test")).isNull(); - assertThat(leftTable.currentSnapshot().summary().get("direction")).isNull(); - - assertThat(rightTable.currentSnapshot().summary().get("flink.test")) - .isEqualTo(TestIcebergSink.class.getName()); - assertThat(rightTable.currentSnapshot().summary().get("direction")).isEqualTo("rightTable"); - } - - @TestTemplate - void testOverrideWriteConfigWithUnknownFileFormat() { - Map newProps = Maps.newHashMap(); - newProps.put(FlinkWriteOptions.WRITE_FORMAT.key(), "UNRECOGNIZED"); - - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - Builder builder = - isTableSchema - ? 
IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps) - .uidSuffix("ingestion") - : IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps) - .uidSuffix("ingestion"); - assertThatThrownBy(builder::append) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid file format: UNRECOGNIZED"); - } - - @TestTemplate - void testWriteRowWithTableRefreshInterval() throws Exception { - List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); - - Configuration flinkConf = new Configuration(); - flinkConf.setString(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key(), "100ms"); - - IcebergSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .flinkConf(flinkConf) - .writeParallelism(parallelism) - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream"); - - // Assert the iceberg table's records. - SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); - } - - @TestTemplate - void testOperatorsUidNameNoUidSuffix() { - List rows = createRows(""); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.HASH) - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.HASH) - .append(); - } - - Transformation firstTransformation = env.getTransformations().get(0); - Transformation secondTransformation = env.getTransformations().get(1); - assertThat(firstTransformation.getUid()).isEqualTo("Sink pre-writer mapper: hadoop.default.t"); - assertThat(firstTransformation.getName()).isEqualTo("Sink pre-writer mapper: hadoop.default.t"); - assertThat(secondTransformation.getUid()).isEqualTo("hadoop.default.t"); - assertThat(secondTransformation.getName()).isEqualTo("hadoop.default.t"); - } - - @TestTemplate - void testOperatorsUidNameWitUidSuffix() { - List rows = createRows(""); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.HASH) - .uidSuffix("data-ingestion") - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.HASH) - .uidSuffix("data-ingestion") - .append(); - } - - Transformation firstTransformation = env.getTransformations().get(0); - Transformation secondTransformation = env.getTransformations().get(1); - 
assertThat(firstTransformation.getUid()).isEqualTo("Sink pre-writer mapper: data-ingestion"); - assertThat(firstTransformation.getName()).isEqualTo("Sink pre-writer mapper: data-ingestion"); - assertThat(secondTransformation.getUid()).isEqualTo("data-ingestion"); - assertThat(secondTransformation.getName()).isEqualTo("data-ingestion"); - } - - @TestTemplate - void testErrorOnNullForRequiredField() { - assumeThat(format) - .as("ORC file format supports null values even for required fields.") - .isNotEqualTo(FileFormat.ORC); - - Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "id2", Types.IntegerType.get()), - Types.NestedField.required(2, "data2", Types.StringType.get())); - TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, "t2"); - Table table2 = - CATALOG_EXTENSION - .catalog() - .createTable( - tableIdentifier, - icebergSchema, - PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); - - // Null out a required field - List rows = List.of(Row.of(42, null)); - - env = StreamExecutionEnvironment.getExecutionEnvironment(); - - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); - - if (isTableSchema) { - TableSchema flinkSchema = FlinkSchemaUtil.toSchema(icebergSchema); - IcebergSink.forRow(dataStream, flinkSchema) - .table(table2) - .tableLoader(TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), tableIdentifier)) - .tableSchema(flinkSchema) - .writeParallelism(parallelism) - .append(); - } else { - ResolvedSchema flinkSchema = FlinkSchemaUtil.toResolvedSchema(icebergSchema); - IcebergSink.forRow(dataStream, flinkSchema) - .table(table2) - .tableLoader(TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), tableIdentifier)) - .resolvedSchema(flinkSchema) - .writeParallelism(parallelism) - .append(); - } - - assertThatThrownBy(() -> env.execute()).hasRootCauseInstanceOf(NullPointerException.class); - } - - @TestTemplate - void testDefaultWriteParallelism() { - List rows = createRows(""); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); - - var sink = - isTableSchema - ? IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .distributionMode(DistributionMode.NONE) - .append() - : IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .distributionMode(DistributionMode.NONE) - .append(); - - // since the sink write parallelism was null, it asserts that the default parallelism used was - // the input source parallelism. - // sink.getTransformation is referring to the SinkV2 Writer Operator associated to the - // IcebergSink - assertThat(sink.getTransformation().getParallelism()).isEqualTo(dataStream.getParallelism()); - } - - @TestTemplate - void testWriteParallelism() { - List rows = createRows(""); - - // the parallelism of this input source is always 1, as this is a non-parallel source. - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); - - var sink = - isTableSchema - ? 
IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .distributionMode(DistributionMode.NONE) - .writeParallelism(parallelism) - .append() - : IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .distributionMode(DistributionMode.NONE) - .writeParallelism(parallelism) - .append(); - - // The parallelism has been properly specified when creating the IcebergSink, so this asserts - // that its value is the same as the parallelism TestTemplate parameter - // sink.getTransformation is referring to the SinkV2 Writer Operator associated to the - // IcebergSink - assertThat(sink.getTransformation().getParallelism()).isEqualTo(parallelism); - } - - private void testWriteRow(ResolvedSchema resolvedSchema, DistributionMode distributionMode) - throws Exception { - List rows = createRows(""); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO).uid("mySourceId"); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .tableSchema( - resolvedSchema == null ? null : TableSchema.fromResolvedSchema(resolvedSchema)) - .writeParallelism(parallelism) - .distributionMode(distributionMode) - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .resolvedSchema(resolvedSchema) - .writeParallelism(parallelism) - .distributionMode(distributionMode) - .append(); - } - - // Execute the program. - env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java deleted file mode 100644 index ee5560712657..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkBranch.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.types.Row; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergSinkBranch extends TestFlinkIcebergSinkBase { - - @Parameter(index = 0) - private String branch; - - @Parameter(index = 1) - private boolean isTableSchema; - - @Parameters(name = "branch = {0}, isTableSchema = {1}") - public static Object[][] parameters() { - return new Object[][] { - // Remove after the deprecation of TableSchema - BEGIN - {"main", true}, - {"testBranch", true}, - // Remove after the deprecation of TableSchema - END - - {"main", false}, - {"testBranch", false}, - }; - } - - @BeforeEach - public void before() throws IOException { - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of( - TableProperties.DEFAULT_FILE_FORMAT, - FileFormat.AVRO.name(), - TableProperties.FORMAT_VERSION, - "1")); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testWriteRowWithTableSchema() throws Exception { - testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); - verifyOtherBranchUnmodified(); - } - - private void testWriteRow(ResolvedSchema resolvedSchema, DistributionMode distributionMode) - throws Exception { - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .tableSchema(TableSchema.fromResolvedSchema(resolvedSchema)) - .toBranch(branch) - .distributionMode(distributionMode) - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .resolvedSchema(resolvedSchema) - .toBranch(branch) - .distributionMode(distributionMode) - .append(); - } - - // Execute the program. - env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(table, convertToRowData(rows), branch); - SimpleDataUtil.assertTableRows( - table, - ImmutableList.of(), - branch.equals(SnapshotRef.MAIN_BRANCH) ? 
"test-branch" : SnapshotRef.MAIN_BRANCH); - - verifyOtherBranchUnmodified(); - } - - private void verifyOtherBranchUnmodified() { - String otherBranch = - branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH; - if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { - assertThat(table.currentSnapshot()).isNull(); - } - - assertThat(table.snapshot(otherBranch)).isNull(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java deleted file mode 100644 index b84d21d020b3..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkCompact.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import org.apache.flink.runtime.jobgraph.JobVertex; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.graph.StreamGraph; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestReader; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.maintenance.api.LockConfig; -import org.apache.iceberg.flink.maintenance.api.RewriteDataFilesConfig; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -class TestIcebergSinkCompact extends TestFlinkIcebergSinkBase { - - private Map flinkConf; - - @BeforeEach - void before() throws IOException { - this.flinkConf = Maps.newHashMap(); - flinkConf.put(FlinkWriteOptions.COMPACTION_ENABLE.key(), "true"); - flinkConf.put(LockConfig.LOCK_TYPE_OPTION.key(), LockConfig.JdbcLockConfig.JDBC); - flinkConf.put( - LockConfig.JdbcLockConfig.JDBC_URI_OPTION.key(), - "jdbc:sqlite:file::memory:?ic" + 
UUID.randomUUID().toString().replace("-", "")); - flinkConf.put(LockConfig.LOCK_ID_OPTION.key(), "test-lock-id"); - flinkConf.put(RewriteDataFilesConfig.SCHEDULE_ON_DATA_FILE_SIZE, "1"); - - flinkConf.put(LockConfig.JdbcLockConfig.JDBC_INIT_LOCK_TABLE_OPTION.key(), "true"); - flinkConf.put(RewriteDataFilesConfig.PREFIX + SizeBasedFileRewritePlanner.REWRITE_ALL, "true"); - - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - PartitionSpec.unpartitioned(), - Maps.newHashMap()); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @Test - public void testCompactFileE2e() throws Exception { - List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); - - IcebergSink.forRowData(dataStream) - .setAll(flinkConf) - .table(table) - .tableLoader(tableLoader) - .append(); - - env.execute("Test Iceberg Compaction DataStream"); - - table.refresh(); - // check the data file count after compact - List afterCompactDataFiles = getDataFiles(table.currentSnapshot(), table); - assertThat(afterCompactDataFiles).hasSize(1); - - // check the data file count before compact - List preCompactDataFiles = - getDataFiles(table.snapshot(table.currentSnapshot().parentId()), table); - assertThat(preCompactDataFiles).hasSize(3); - } - - private List getDataFiles(Snapshot snapshot, Table table) throws IOException { - List dataFiles = Lists.newArrayList(); - for (ManifestFile dataManifest : snapshot.dataManifests(table.io())) { - try (ManifestReader reader = ManifestFiles.read(dataManifest, table.io())) { - reader.iterator().forEachRemaining(dataFiles::add); - } - } - - return dataFiles; - } - - @Test - public void testTableMaintenanceOperatorAdded() { - List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); - - IcebergSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .setAll(flinkConf) - .append(); - - boolean containRewrite = false; - StreamGraph streamGraph = env.getStreamGraph(); - for (JobVertex vertex : streamGraph.getJobGraph().getVertices()) { - if (vertex.getName().contains("Rewrite")) { - containRewrite = true; - break; - } - } - - assertThat(containRewrite).isTrue(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java deleted file mode 100644 index f873dcd99c06..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2.java +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.util.List; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; - -@ExtendWith(ParameterizedTestExtension.class) -@Timeout(value = 60) -public class TestIcebergSinkV2 extends TestFlinkIcebergSinkV2Base { - @RegisterExtension - public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - @BeforeEach - public void setupTable() { - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - partitioned - ? 
PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - ImmutableMap.of( - TableProperties.DEFAULT_FILE_FORMAT, - format.name(), - TableProperties.FORMAT_VERSION, - String.valueOf(FORMAT_V2))); - - table - .updateProperties() - .set(TableProperties.DEFAULT_FILE_FORMAT, format.name()) - .set(TableProperties.WRITE_DISTRIBUTION_MODE, writeDistributionMode) - .commit(); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100L) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testCheckAndGetEqualityFieldIds() { - table - .updateSchema() - .allowIncompatibleChanges() - .addRequiredColumn("type", Types.StringType.get()) - .setIdentifierFields("type") - .commit(); - - DataStream dataStream = - env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - IcebergSink.Builder builder = - isTableSchema - ? IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA).table(table) - : IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); - - // Use user-provided equality field column as equality field id list - builder.equalityFieldColumns(Lists.newArrayList("id")); - assertThat(SinkUtil.checkAndGetEqualityFieldIds(table, Lists.newArrayList("id"))) - .containsExactlyInAnyOrder(table.schema().findField("id").fieldId()); - } - - @TestTemplate - public void testChangeLogOnIdKey() throws Exception { - testChangeLogOnIdKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertOnlyDeletesOnDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa")), - ImmutableList.of(row("-D", 1, "aaa"), row("-D", 2, "bbb"))); - - List> expectedRecords = - ImmutableList.of(ImmutableList.of(record(1, "aaa")), ImmutableList.of()); - - testChangeLogs( - ImmutableList.of("data"), - row -> row.getField(ROW_DATA_POS), - true, - elementsPerCheckpoint, - expectedRecords, - SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testChangeLogOnDataKey() throws Exception { - testChangeLogOnDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testChangeLogOnIdDataKey() throws Exception { - testChangeLogOnIdDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testChangeLogOnSameKey() throws Exception { - testChangeLogOnSameKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertModeCheck() { - DataStream dataStream = - env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - IcebergSink.Builder builder = - isTableSchema - ? 
IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .writeParallelism(parallelism) - .upsert(true) - : IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .tableLoader(tableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .upsert(true); - - assertThatThrownBy( - () -> - builder - .equalityFieldColumns(ImmutableList.of("id", "data")) - .overwrite(true) - .append()) - .isInstanceOf(IllegalStateException.class) - .hasMessage( - "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); - - assertThatThrownBy( - () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()) - .isInstanceOf(IllegalStateException.class) - .hasMessage( - "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); - } - - @TestTemplate - public void testUpsertOnIdKey() throws Exception { - testUpsertOnIdKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertOnDataKey() throws Exception { - testUpsertOnDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertOnIdDataKey() throws Exception { - testUpsertOnIdDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testDeleteStats() throws Exception { - assumeThat(format).isNotEqualTo(FileFormat.AVRO); - - List> elementsPerCheckpoint = - ImmutableList.of( - // Checkpoint #1 - ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa"))); - - List> expectedRecords = ImmutableList.of(ImmutableList.of(record(1, "aaa"))); - - testChangeLogs( - ImmutableList.of("id", "data"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, - elementsPerCheckpoint, - expectedRecords, - "main"); - - DeleteFile deleteFile = table.currentSnapshot().addedDeleteFiles(table.io()).iterator().next(); - String fromStat = - new String( - deleteFile.lowerBounds().get(MetadataColumns.DELETE_FILE_PATH.fieldId()).array()); - DataFile dataFile = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - assumeThat(fromStat).isEqualTo(dataFile.location()); - } - - protected void testChangeLogs( - List equalityFieldColumns, - KeySelector keySelector, - boolean insertAsUpsert, - List> elementsPerCheckpoint, - List> expectedRecordsPerCheckpoint, - String branch) - throws Exception { - DataStream dataStream = - env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); - - if (isTableSchema) { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_TABLE_SCHEMA) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_TABLE_SCHEMA) - .writeParallelism(parallelism) - .equalityFieldColumns(equalityFieldColumns) - .upsert(insertAsUpsert) - .toBranch(branch) - .uidSuffix("sink") - .append(); - } else { - IcebergSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .tableLoader(tableLoader) - .resolvedSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .equalityFieldColumns(equalityFieldColumns) - .upsert(insertAsUpsert) - .toBranch(branch) - .uidSuffix("sink") - .append(); - } - - // Execute the program. 
- env.execute("Test Iceberg Change-Log DataStream."); - - table.refresh(); - List snapshots = findValidSnapshots(); - int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - assertThat(snapshots).hasSize(expectedSnapshotNum); - - for (int i = 0; i < expectedSnapshotNum; i++) { - long snapshotId = snapshots.get(i).snapshotId(); - List expectedRecords = expectedRecordsPerCheckpoint.get(i); - assertThat(actualRowSet(snapshotId, "*")) - .as("Should have the expected records for the checkpoint#" + i) - .isEqualTo(expectedRowSet(expectedRecords.toArray(new Record[0]))); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java deleted file mode 100644 index 4896f7f48c17..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergSinkV2Branch.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergSinkV2Branch extends TestFlinkIcebergSinkV2Branch { - - @BeforeEach - @Override - public void before() throws IOException { - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of( - TableProperties.DEFAULT_FILE_FORMAT, - FileFormat.AVRO.name(), - TableProperties.FORMAT_VERSION, - "2")); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testChangeLogOnIdKey() throws Exception { - testChangeLogOnIdKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testChangeLogOnDataKey() throws Exception { - testChangeLogOnDataKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testChangeLogOnIdDataKey() throws Exception { - testChangeLogOnIdDataKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testUpsertOnIdKey() throws Exception { - testUpsertOnIdKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testUpsertOnDataKey() throws Exception { - testUpsertOnDataKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testUpsertOnIdDataKey() throws Exception { - testUpsertOnIdDataKey(branch); - verifyOtherBranchUnmodified(); - } - - private void verifyOtherBranchUnmodified() { - String otherBranch = - branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH; - if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { - assertThat(table.currentSnapshot()); - } - - assertThat(table.snapshot(otherBranch)).isNull(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java deleted file mode 100644 index 7f4f7758e519..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.Arrays; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergStreamWriter { - @TempDir protected java.nio.file.Path temporaryFolder; - - private Table table; - - @Parameter(index = 0) - private FileFormat format; - - @Parameter(index = 1) - private boolean partitioned; - - @Parameters(name = "format = {0}, partitioned = {1}") - public static Object[][] parameters() { - return new Object[][] { - {FileFormat.AVRO, true}, - {FileFormat.AVRO, false}, - {FileFormat.ORC, true}, - {FileFormat.ORC, false}, - {FileFormat.PARQUET, true}, - {FileFormat.PARQUET, false} - }; - } - - @BeforeEach - public void before() throws IOException { - File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); - // Construct the iceberg table. 
- Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - table = SimpleDataUtil.createTable(folder.getAbsolutePath(), props, partitioned); - } - - @TestTemplate - public void testWritingTable() throws Exception { - long checkpointId = 1L; - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - // The first checkpoint - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(3, "hello"), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - int expectedDataFiles = partitioned ? 2 : 1; - WriteResult result = - WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - - checkpointId = checkpointId + 1; - - // The second checkpoint - testHarness.processElement(SimpleDataUtil.createRowData(4, "foo"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(5, "bar"), 2); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - expectedDataFiles = partitioned ? 4 : 2; - result = - WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - - // Commit the iceberg transaction. - AppendFiles appendFiles = table.newAppend(); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - appendFiles.commit(); - - // Assert the table records. - SimpleDataUtil.assertTableRecords( - table, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, "hello"), - SimpleDataUtil.createRecord(4, "foo"), - SimpleDataUtil.createRecord(5, "bar"))); - } - } - - @TestTemplate - public void testSnapshotTwice() throws Exception { - long checkpointId = 1; - long timestamp = 1; - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), timestamp++); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), timestamp); - - testHarness.prepareSnapshotPreBarrier(checkpointId++); - int expectedDataFiles = partitioned ? 2 : 1; - WriteResult result = - WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - - // snapshot again immediately. - for (int i = 0; i < 5; i++) { - testHarness.prepareSnapshotPreBarrier(checkpointId++); - - result = - WriteResult.builder() - .addAll(getWriteResults(testHarness.extractOutputValues())) - .build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - } - } - } - - @TestTemplate - public void testTableWithoutSnapshot() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - assertThat(testHarness.extractOutputValues()).isEmpty(); - } - // Even if we closed the iceberg stream writer, there's no orphan data file. 
- assertThat(scanDataFiles()).isEmpty(); - - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - // Still not emit the data file yet, because there is no checkpoint. - assertThat(testHarness.extractOutputValues()).isEmpty(); - } - // Once we closed the iceberg stream writer, there will left an orphan data file. - assertThat(scanDataFiles()).hasSize(1); - } - - private Set scanDataFiles() throws IOException { - Path dataDir = new Path(table.location(), "data"); - FileSystem fs = FileSystem.get(new Configuration()); - if (!fs.exists(dataDir)) { - return ImmutableSet.of(); - } else { - Set paths = Sets.newHashSet(); - RemoteIterator iterators = fs.listFiles(dataDir, true); - while (iterators.hasNext()) { - LocatedFileStatus status = iterators.next(); - if (status.isFile()) { - Path path = status.getPath(); - if (path.getName().endsWith("." + format.toString().toLowerCase(Locale.ROOT))) { - paths.add(path.toString()); - } - } - } - return paths; - } - } - - @TestTemplate - public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); - - assertThat(testHarness.getOneInputOperator()).isInstanceOf(BoundedOneInput.class); - ((BoundedOneInput) testHarness.getOneInputOperator()).endInput(); - - int expectedDataFiles = partitioned ? 2 : 1; - WriteResult result = - WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - - ((BoundedOneInput) testHarness.getOneInputOperator()).endInput(); - - result = - WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); - assertThat(result.deleteFiles()).isEmpty(); - // Datafiles should not be sent again - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - } - } - - @TestTemplate - public void testBoundedStreamTriggeredEndInputBeforeTriggeringCheckpoint() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); - - testHarness.endInput(); - - int expectedDataFiles = partitioned ? 2 : 1; - WriteResult result = - WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - - testHarness.prepareSnapshotPreBarrier(1L); - - result = - WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); - assertThat(result.deleteFiles()).isEmpty(); - // It should be ensured that after endInput is triggered, when prepareSnapshotPreBarrier - // is triggered, write should only send WriteResult once - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - } - } - - @TestTemplate - public void testTableWithTargetFileSize() throws Exception { - // Adjust the target-file-size in table properties. 
- table - .updateProperties() - .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger - .commit(); - - List rows = Lists.newArrayListWithCapacity(8000); - List records = Lists.newArrayListWithCapacity(8000); - for (int i = 0; i < 2000; i++) { - for (String data : new String[] {"a", "b", "c", "d"}) { - rows.add(SimpleDataUtil.createRowData(i, data)); - records.add(SimpleDataUtil.createRecord(i, data)); - } - } - - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - for (RowData row : rows) { - testHarness.processElement(row, 1); - } - - // snapshot the operator. - testHarness.prepareSnapshotPreBarrier(1); - WriteResult result = - WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(8); - - // Assert that the data file have the expected records. - for (DataFile dataFile : result.dataFiles()) { - assertThat(dataFile.recordCount()).isEqualTo(1000); - } - - // Commit the iceberg transaction. - AppendFiles appendFiles = table.newAppend(); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - appendFiles.commit(); - } - - // Assert the table records. - SimpleDataUtil.assertTableRecords(table, records); - } - - @TestTemplate - public void testPromotedFlinkDataType() throws Exception { - Schema iSchema = - new Schema( - Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), - Types.NestedField.required(2, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get())); - ResolvedSchema flinkSchema = - ResolvedSchema.of( - Column.physical("tinyint", DataTypes.TINYINT().notNull()), - Column.physical("smallint", DataTypes.SMALLINT().notNull()), - Column.physical("int", DataTypes.INT().nullable())); - - PartitionSpec spec; - if (partitioned) { - spec = - PartitionSpec.builderFor(iSchema) - .identity("smallint") - .identity("tinyint") - .identity("int") - .build(); - } else { - spec = PartitionSpec.unpartitioned(); - } - - String location = - Files.createTempDirectory(temporaryFolder, "junit").toFile().getAbsolutePath(); - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - Table icebergTable = new HadoopTables().create(iSchema, spec, props, location); - - List rows = - Lists.newArrayList( - GenericRowData.of((byte) 0x01, (short) -32768, 101), - GenericRowData.of((byte) 0x02, (short) 0, 102), - GenericRowData.of((byte) 0x03, (short) 32767, 103)); - - Record record = GenericRecord.create(iSchema); - List expected = - Lists.newArrayList( - record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), - record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), - record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103))); - - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter(icebergTable, flinkSchema)) { - for (RowData row : rows) { - testHarness.processElement(row, 1); - } - testHarness.prepareSnapshotPreBarrier(1); - WriteResult result = - WriteResult.builder().addAll(getWriteResults(testHarness.extractOutputValues())).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(partitioned ? 3 : 1); - - // Commit the iceberg transaction. 
- AppendFiles appendFiles = icebergTable.newAppend(); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - appendFiles.commit(); - } - - SimpleDataUtil.assertTableRecords(location, expected); - } - - private static List getWriteResults(List flinkWriteResults) { - return flinkWriteResults.stream() - .map(FlinkWriteResult::writeResult) - .collect(Collectors.toList()); - } - - private OneInputStreamOperatorTestHarness createIcebergStreamWriter() - throws Exception { - return createIcebergStreamWriter(table, SimpleDataUtil.FLINK_SCHEMA); - } - - private OneInputStreamOperatorTestHarness createIcebergStreamWriter( - Table icebergTable, ResolvedSchema flinkSchema) throws Exception { - RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); - FlinkWriteConf flinkWriteConfig = - new FlinkWriteConf( - icebergTable, Maps.newHashMap(), new org.apache.flink.configuration.Configuration()); - - IcebergStreamWriter streamWriter = - FlinkSink.createStreamWriter(() -> icebergTable, flinkWriteConfig, flinkRowType, null); - OneInputStreamOperatorTestHarness harness = - new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); - - harness.setup(); - harness.open(); - - return harness; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java deleted file mode 100644 index 919fef579ab0..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.RandomRowData; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestRowDataPartitionKey { - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(0, "boolType", Types.BooleanType.get()), - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "longType", Types.LongType.get()), - Types.NestedField.required(3, "dateType", Types.DateType.get()), - Types.NestedField.required(4, "timeType", Types.TimeType.get()), - Types.NestedField.required(5, "stringType", Types.StringType.get()), - Types.NestedField.required(6, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.required(7, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.required(8, "fixedType", Types.FixedType.ofLength(5)), - Types.NestedField.required(9, "uuidType", Types.UUIDType.get()), - Types.NestedField.required(10, "binaryType", Types.BinaryType.get()), - Types.NestedField.required(11, "decimalType1", Types.DecimalType.of(18, 3)), - Types.NestedField.required(12, "decimalType2", Types.DecimalType.of(10, 5)), - Types.NestedField.required(13, "decimalType3", Types.DecimalType.of(38, 19)), - Types.NestedField.required(14, "floatType", Types.FloatType.get()), - Types.NestedField.required(15, "doubleType", Types.DoubleType.get())); - - private static final List SUPPORTED_PRIMITIVES = - SCHEMA.asStruct().fields().stream().map(Types.NestedField::name).collect(Collectors.toList()); - - private static final Schema NESTED_SCHEMA = - new Schema( - Types.NestedField.required( - 1, - "structType", - Types.StructType.of( - Types.NestedField.optional(2, "innerStringType", Types.StringType.get()), - Types.NestedField.optional(3, "innerIntegerType", Types.IntegerType.get())))); - - @Test - public void testNullPartitionValue() { - Schema schema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); - - PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build(); - - List rows = - Lists.newArrayList( - GenericRowData.of(1, StringData.fromString("a")), - GenericRowData.of(2, StringData.fromString("b")), - GenericRowData.of(3, null)); - - RowDataWrapper rowWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - - for (RowData row : rows) { - PartitionKey partitionKey = new PartitionKey(spec, schema); - partitionKey.partition(rowWrapper.wrap(row)); - assertThat(partitionKey.size()).isEqualTo(1); - - String expectedStr = row.isNullAt(1) ? 
null : row.getString(1).toString(); - assertThat(partitionKey.get(0, String.class)).isEqualTo(expectedStr); - } - } - - @Test - public void testPartitionWithOneNestedField() { - RowDataWrapper rowWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); - List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1991); - List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - - PartitionSpec spec1 = - PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerStringType").build(); - PartitionSpec spec2 = - PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerIntegerType").build(); - - for (int i = 0; i < rows.size(); i++) { - RowData row = rows.get(i); - Record record = (Record) records.get(i).get(0); - - PartitionKey partitionKey1 = new PartitionKey(spec1, NESTED_SCHEMA); - partitionKey1.partition(rowWrapper.wrap(row)); - assertThat(partitionKey1.size()).isEqualTo(1); - - assertThat(partitionKey1.get(0, String.class)).isEqualTo(record.get(0)); - - PartitionKey partitionKey2 = new PartitionKey(spec2, NESTED_SCHEMA); - partitionKey2.partition(rowWrapper.wrap(row)); - assertThat(partitionKey2.size()).isEqualTo(1); - - assertThat(partitionKey2.get(0, Integer.class)).isEqualTo(record.get(1)); - } - } - - @Test - public void testPartitionMultipleNestedField() { - RowDataWrapper rowWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); - List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1992); - List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - - PartitionSpec spec1 = - PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerIntegerType") - .identity("structType.innerStringType") - .build(); - PartitionSpec spec2 = - PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerStringType") - .identity("structType.innerIntegerType") - .build(); - - PartitionKey pk1 = new PartitionKey(spec1, NESTED_SCHEMA); - PartitionKey pk2 = new PartitionKey(spec2, NESTED_SCHEMA); - - for (int i = 0; i < rows.size(); i++) { - RowData row = rows.get(i); - Record record = (Record) records.get(i).get(0); - - pk1.partition(rowWrapper.wrap(row)); - assertThat(pk1.size()).isEqualTo(2); - - assertThat(pk1.get(0, Integer.class)).isEqualTo(record.get(1)); - assertThat(pk1.get(1, String.class)).isEqualTo(record.get(0)); - - pk2.partition(rowWrapper.wrap(row)); - assertThat(pk2.size()).isEqualTo(2); - - assertThat(pk2.get(0, String.class)).isEqualTo(record.get(0)); - assertThat(pk2.get(1, Integer.class)).isEqualTo(record.get(1)); - } - } - - @Test - public void testPartitionValueTypes() { - RowType rowType = FlinkSchemaUtil.convert(SCHEMA); - RowDataWrapper rowWrapper = new RowDataWrapper(rowType, SCHEMA.asStruct()); - InternalRecordWrapper recordWrapper = new InternalRecordWrapper(SCHEMA.asStruct()); - - List records = RandomGenericData.generate(SCHEMA, 10, 1993); - List rows = Lists.newArrayList(RandomRowData.convert(SCHEMA, records)); - - for (String column : SUPPORTED_PRIMITIVES) { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity(column).build(); - Class[] javaClasses = spec.javaClasses(); - - PartitionKey pk = new PartitionKey(spec, SCHEMA); - PartitionKey expectedPK = new PartitionKey(spec, SCHEMA); - - for (int j = 0; j < rows.size(); j++) { - RowData row = rows.get(j); - Record record = records.get(j); - - pk.partition(rowWrapper.wrap(row)); - expectedPK.partition(recordWrapper.wrap(record)); - - 
assertThat(pk.size()) - .as("Partition with column " + column + " should have one field.") - .isEqualTo(1); - - if (column.equals("timeType")) { - assertThat(pk.get(0, Long.class) / 1000) - .as("Partition with column " + column + " should have the expected values") - .isEqualTo(expectedPK.get(0, Long.class) / 1000); - } else { - assertThat(pk.get(0, javaClasses[0])) - .as("Partition with column " + column + " should have the expected values") - .isEqualTo(expectedPK.get(0, javaClasses[0])); - } - } - } - } - - @Test - public void testNestedPartitionValues() { - Schema nestedSchema = new Schema(Types.NestedField.optional(1001, "nested", SCHEMA.asStruct())); - RowType rowType = FlinkSchemaUtil.convert(nestedSchema); - - RowDataWrapper rowWrapper = new RowDataWrapper(rowType, nestedSchema.asStruct()); - InternalRecordWrapper recordWrapper = new InternalRecordWrapper(nestedSchema.asStruct()); - - List records = RandomGenericData.generate(nestedSchema, 10, 1994); - List rows = Lists.newArrayList(RandomRowData.convert(nestedSchema, records)); - - for (String supportedPrimitive : SUPPORTED_PRIMITIVES) { - String column = String.format("nested.%s", supportedPrimitive); - - PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity(column).build(); - Class[] javaClasses = spec.javaClasses(); - - PartitionKey pk = new PartitionKey(spec, nestedSchema); - PartitionKey expectedPK = new PartitionKey(spec, nestedSchema); - - for (int j = 0; j < rows.size(); j++) { - pk.partition(rowWrapper.wrap(rows.get(j))); - expectedPK.partition(recordWrapper.wrap(records.get(j))); - - assertThat(pk.size()) - .as("Partition with nested column " + column + " should have one field.") - .isEqualTo(1); - - if (column.equals("nested.timeType")) { - assertThat(pk.get(0, Long.class) / 1000) - .as("Partition with nested column " + column + " should have the expected values.") - .isEqualTo(expectedPK.get(0, Long.class) / 1000); - } else { - assertThat(pk.get(0, javaClasses[0])) - .as("Partition with nested column " + column + " should have the expected values.") - .isEqualTo(expectedPK.get(0, javaClasses[0])); - } - } - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java deleted file mode 100644 index 6b7b0d4c35a3..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.RowData; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.data.RandomRowData; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestTaskWriters { - private static final Configuration CONF = new Configuration(); - private static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; - - @TempDir protected java.nio.file.Path temporaryFolder; - - @Parameters(name = "format = {0}, partitioned = {1}") - public static Object[][] parameters() { - return new Object[][] { - {FileFormat.AVRO, true}, - {FileFormat.AVRO, false}, - {FileFormat.ORC, true}, - {FileFormat.ORC, false}, - {FileFormat.PARQUET, true}, - {FileFormat.PARQUET, false} - }; - } - - @Parameter(index = 0) - private FileFormat format; - - @Parameter(index = 1) - private boolean partitioned; - - private Table table; - - @BeforeEach - public void before() throws IOException { - File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); - // Construct the iceberg table with the specified file format. - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - table = SimpleDataUtil.createTable(folder.getAbsolutePath(), props, partitioned); - } - - @TestTemplate - public void testWriteZeroRecord() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.close(); - - DataFile[] dataFiles = taskWriter.dataFiles(); - assertThat(dataFiles).isNotNull().isEmpty(); - - // Close again. - taskWriter.close(); - dataFiles = taskWriter.dataFiles(); - assertThat(dataFiles).isNotNull().isEmpty(); - } - } - - @TestTemplate - public void testCloseTwice() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.write(SimpleDataUtil.createRowData(1, "hello")); - taskWriter.write(SimpleDataUtil.createRowData(2, "world")); - taskWriter.close(); // The first close - taskWriter.close(); // The second close - - int expectedFiles = partitioned ? 
2 : 1; - DataFile[] dataFiles = taskWriter.dataFiles(); - assertThat(dataFiles).hasSize(expectedFiles); - - FileSystem fs = FileSystem.get(CONF); - for (DataFile dataFile : dataFiles) { - assertThat(fs.exists(new Path(dataFile.location()))).isTrue(); - } - } - } - - @TestTemplate - public void testAbort() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.write(SimpleDataUtil.createRowData(1, "hello")); - taskWriter.write(SimpleDataUtil.createRowData(2, "world")); - - taskWriter.abort(); - DataFile[] dataFiles = taskWriter.dataFiles(); - - int expectedFiles = partitioned ? 2 : 1; - assertThat(dataFiles).hasSize(expectedFiles); - - FileSystem fs = FileSystem.get(CONF); - for (DataFile dataFile : dataFiles) { - assertThat(fs.exists(new Path(dataFile.location()))).isFalse(); - } - } - } - - @TestTemplate - public void testCompleteFiles() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.write(SimpleDataUtil.createRowData(1, "a")); - taskWriter.write(SimpleDataUtil.createRowData(2, "b")); - taskWriter.write(SimpleDataUtil.createRowData(3, "c")); - taskWriter.write(SimpleDataUtil.createRowData(4, "d")); - - DataFile[] dataFiles = taskWriter.dataFiles(); - int expectedFiles = partitioned ? 4 : 1; - assertThat(dataFiles).hasSize(expectedFiles); - - dataFiles = taskWriter.dataFiles(); - assertThat(dataFiles).hasSize(expectedFiles); - - FileSystem fs = FileSystem.get(CONF); - for (DataFile dataFile : dataFiles) { - assertThat(fs.exists(new Path(dataFile.location()))).isTrue(); - } - - AppendFiles appendFiles = table.newAppend(); - for (DataFile dataFile : dataFiles) { - appendFiles.appendFile(dataFile); - } - appendFiles.commit(); - - // Assert the data rows. - SimpleDataUtil.assertTableRecords( - table, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c"), - SimpleDataUtil.createRecord(4, "d"))); - } - } - - @TestTemplate - public void testRollingWithTargetFileSize() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(4)) { - List rows = Lists.newArrayListWithCapacity(8000); - List records = Lists.newArrayListWithCapacity(8000); - for (int i = 0; i < 2000; i++) { - for (String data : new String[] {"a", "b", "c", "d"}) { - rows.add(SimpleDataUtil.createRowData(i, data)); - records.add(SimpleDataUtil.createRecord(i, data)); - } - } - - for (RowData row : rows) { - taskWriter.write(row); - } - - DataFile[] dataFiles = taskWriter.dataFiles(); - assertThat(dataFiles).hasSize(8); - - AppendFiles appendFiles = table.newAppend(); - for (DataFile dataFile : dataFiles) { - appendFiles.appendFile(dataFile); - } - appendFiles.commit(); - - // Assert the data rows. - SimpleDataUtil.assertTableRecords(table, records); - } - } - - @TestTemplate - public void testRandomData() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - Iterable rows = RandomRowData.generate(SimpleDataUtil.SCHEMA, 100, 1996); - for (RowData row : rows) { - taskWriter.write(row); - } - - taskWriter.close(); - DataFile[] dataFiles = taskWriter.dataFiles(); - AppendFiles appendFiles = table.newAppend(); - for (DataFile dataFile : dataFiles) { - appendFiles.appendFile(dataFile); - } - appendFiles.commit(); - - // Assert the data rows. 
- SimpleDataUtil.assertTableRows(table, Lists.newArrayList(rows)); - } - } - - private TaskWriter createTaskWriter(long targetFileSize) { - TaskWriterFactory taskWriterFactory = - new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - SimpleDataUtil.ROW_TYPE, - targetFileSize, - format, - table.properties(), - null, - false); - taskWriterFactory.initialize(1, 1); - return taskWriterFactory.create(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java deleted file mode 100644 index 30782e8d4170..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordInternalSerializerTestBase.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.util.Collections; -import org.apache.flink.api.common.typeutils.SerializerTestBase; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.extension.RegisterExtension; - -/** - * Test base for DynamicRecordInternalSerializer which allows to instantiate different serializer - * version, e.g. with writing the schema itself or just the schema id. 
- */ -abstract class DynamicRecordInternalSerializerTestBase - extends SerializerTestBase { - - static final String TABLE = "myTable"; - static final String BRANCH = "myBranch"; - - @RegisterExtension - static final HadoopCatalogExtension CATALOG_EXTENSION = new HadoopCatalogExtension("db", TABLE); - - static final Schema SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get()), - required(3, "number", Types.FloatType.get())); - - static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).bucket("id", 10).build(); - - private boolean writeFullSchemaAndSpec; - - DynamicRecordInternalSerializerTestBase(boolean writeFullSchemaAndSpec) { - this.writeFullSchemaAndSpec = writeFullSchemaAndSpec; - } - - @Override - protected TypeSerializer createSerializer() { - return new DynamicRecordInternalSerializer( - new TableSerializerCache(CATALOG_EXTENSION.catalogLoader(), 1), writeFullSchemaAndSpec); - } - - @BeforeEach - void before() { - CATALOG_EXTENSION.catalog().createTable(TableIdentifier.parse(TABLE), SCHEMA, SPEC); - } - - @Override - protected DynamicRecordInternal[] getTestData() { - GenericRowData rowData = new GenericRowData(3); - rowData.setField(0, 123L); - rowData.setField(1, StringData.fromString("test")); - rowData.setField(2, 1.23f); - - return new DynamicRecordInternal[] { - new DynamicRecordInternal( - TABLE, BRANCH, SCHEMA, rowData, SPEC, 42, false, Collections.emptySet()) - }; - } - - @Override - protected Class getTypeClass() { - return DynamicRecordInternal.class; - } - - @Override - protected int getLength() { - return -1; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java deleted file mode 100644 index 385a354889fb..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestCompareSchemasVisitor.java +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.Types.IntegerType; -import org.apache.iceberg.types.Types.ListType; -import org.apache.iceberg.types.Types.LongType; -import org.apache.iceberg.types.Types.MapType; -import org.apache.iceberg.types.Types.StringType; -import org.apache.iceberg.types.Types.StructType; -import org.junit.jupiter.api.Test; - -class TestCompareSchemasVisitor { - - @Test - void testSchema() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(1, "id", IntegerType.get(), "comment"), - optional(2, "data", StringType.get()), - optional(3, "extra", StringType.get())), - new Schema( - optional(1, "id", IntegerType.get(), "comment"), - optional(2, "data", StringType.get()), - optional(3, "extra", StringType.get())))) - .isEqualTo(CompareSchemasVisitor.Result.SAME); - } - - @Test - void testSchemaDifferentId() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(0, "id", IntegerType.get()), - optional(1, "data", StringType.get()), - optional(2, "extra", StringType.get())), - new Schema( - optional(1, "id", IntegerType.get()), - optional(2, "data", StringType.get()), - optional(3, "extra", StringType.get())))) - .isEqualTo(CompareSchemasVisitor.Result.SAME); - } - - @Test - void testSchemaDifferent() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(0, "id", IntegerType.get()), - optional(1, "data", StringType.get()), - optional(2, "extra", StringType.get())), - new Schema( - optional(0, "id", IntegerType.get()), optional(1, "data", StringType.get())))) - .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); - } - - @Test - void testSchemaWithMoreColumns() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(0, "id", IntegerType.get()), optional(1, "data", StringType.get())), - new Schema( - optional(0, "id", IntegerType.get()), - optional(1, "data", StringType.get()), - optional(2, "extra", StringType.get())))) - .isEqualTo(CompareSchemasVisitor.Result.DATA_CONVERSION_NEEDED); - } - - @Test - void testDifferentType() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(1, "id", LongType.get()), optional(2, "extra", StringType.get())), - new Schema( - optional(1, "id", IntegerType.get()), optional(2, "extra", StringType.get())))) - .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); - } - - @Test - void testCompatibleType() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(1, "id", IntegerType.get()), optional(2, "extra", StringType.get())), - new Schema( - optional(1, "id", LongType.get()), optional(2, "extra", StringType.get())))) - .isEqualTo(CompareSchemasVisitor.Result.DATA_CONVERSION_NEEDED); - } - - @Test - void testRequiredChangeForMatchingField() { - Schema dataSchema = - new Schema(optional(1, "id", IntegerType.get()), optional(2, "extra", StringType.get())); - Schema tableSchema = - new Schema(required(1, "id", IntegerType.get()), optional(2, "extra", StringType.get())); - assertThat(CompareSchemasVisitor.visit(dataSchema, tableSchema)) - .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); - assertThat(CompareSchemasVisitor.visit(tableSchema, dataSchema)) - .isEqualTo(CompareSchemasVisitor.Result.SAME); - } - - @Test - void 
testRequiredChangeForNonMatchingField() { - Schema dataSchema = new Schema(optional(1, "id", IntegerType.get())); - Schema tableSchema = - new Schema(optional(1, "id", IntegerType.get()), required(2, "extra", StringType.get())); - assertThat(CompareSchemasVisitor.visit(dataSchema, tableSchema)) - .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); - assertThat(CompareSchemasVisitor.visit(tableSchema, dataSchema)) - .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); - } - - @Test - void testNoRequiredChangeForNonMatchingField() { - Schema dataSchema = new Schema(required(1, "id", IntegerType.get())); - Schema tableSchema = - new Schema(required(1, "id", IntegerType.get()), optional(2, "extra", StringType.get())); - assertThat(CompareSchemasVisitor.visit(dataSchema, tableSchema)) - .isEqualTo(CompareSchemasVisitor.Result.DATA_CONVERSION_NEEDED); - } - - @Test - void testStructDifferentId() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(1, "id", IntegerType.get()), - optional(2, "struct1", StructType.of(optional(3, "extra", IntegerType.get())))), - new Schema( - optional(0, "id", IntegerType.get()), - optional( - 1, "struct1", StructType.of(optional(2, "extra", IntegerType.get())))))) - .isEqualTo(CompareSchemasVisitor.Result.SAME); - } - - @Test - void testStructChanged() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(0, "id", IntegerType.get()), - optional(1, "struct1", StructType.of(optional(2, "extra", LongType.get())))), - new Schema( - optional(1, "id", IntegerType.get()), - optional( - 2, "struct1", StructType.of(optional(3, "extra", IntegerType.get())))))) - .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); - } - - @Test - void testMapDifferentId() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(1, "id", IntegerType.get()), - optional( - 2, "map1", MapType.ofOptional(3, 4, IntegerType.get(), StringType.get()))), - new Schema( - optional(0, "id", IntegerType.get()), - optional( - 1, "map1", MapType.ofOptional(2, 3, IntegerType.get(), StringType.get()))))) - .isEqualTo(CompareSchemasVisitor.Result.SAME); - } - - @Test - void testMapChanged() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(1, "id", IntegerType.get()), - optional( - 2, "map1", MapType.ofOptional(3, 4, LongType.get(), StringType.get()))), - new Schema( - optional(1, "id", IntegerType.get()), - optional( - 2, "map1", MapType.ofOptional(3, 4, IntegerType.get(), StringType.get()))))) - .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); - } - - @Test - void testListDifferentId() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(1, "id", IntegerType.get()), - optional(2, "list1", ListType.ofOptional(3, IntegerType.get()))), - new Schema( - optional(0, "id", IntegerType.get()), - optional(1, "list1", ListType.ofOptional(2, IntegerType.get()))))) - .isEqualTo(CompareSchemasVisitor.Result.SAME); - } - - @Test - void testListChanged() { - assertThat( - CompareSchemasVisitor.visit( - new Schema( - optional(0, "id", IntegerType.get()), - optional(1, "list1", ListType.ofOptional(2, LongType.get()))), - new Schema( - optional(1, "id", IntegerType.get()), - optional(2, "list1", ListType.ofOptional(3, IntegerType.get()))))) - .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java deleted file mode 100644 index 13a06d362717..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommittableSerializer.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import org.apache.flink.api.common.JobID; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.junit.jupiter.api.Test; - -class TestDynamicCommittableSerializer { - - @Test - void testRoundtrip() throws IOException { - DynamicCommittable committable = - new DynamicCommittable( - new WriteTarget("table", "branch", 42, 23, false, Sets.newHashSet(1, 2)), - new byte[] {3, 4}, - JobID.generate().toHexString(), - new OperatorID().toHexString(), - 5); - - DynamicCommittableSerializer serializer = new DynamicCommittableSerializer(); - assertThat(serializer.deserialize(serializer.getVersion(), serializer.serialize(committable))) - .isEqualTo(committable); - } - - @Test - void testUnsupportedVersion() throws IOException { - DynamicCommittable committable = - new DynamicCommittable( - new WriteTarget("table", "branch", 42, 23, false, Sets.newHashSet(1, 2)), - new byte[] {3, 4}, - JobID.generate().toHexString(), - new OperatorID().toHexString(), - 5); - - DynamicCommittableSerializer serializer = new DynamicCommittableSerializer(); - assertThatThrownBy(() -> serializer.deserialize(-1, serializer.serialize(committable))) - .hasMessage("Unrecognized version or corrupt state: -1") - .isInstanceOf(IOException.class); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java deleted file mode 100644 index 99a546536208..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicCommitter.java +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.ByteBuffer; -import java.util.Map; -import org.apache.flink.api.common.JobID; -import org.apache.flink.api.connector.sink2.Committer.CommitRequest; -import org.apache.flink.api.connector.sink2.mocks.MockCommitRequest; -import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; - -class TestDynamicCommitter { - - static final String DB = "db"; - static final String TABLE1 = "table"; - static final String TABLE2 = "table2"; - - @RegisterExtension - static final HadoopCatalogExtension CATALOG_EXTENSION = new HadoopCatalogExtension(DB, TABLE1); - - Catalog catalog; - - private static final DataFile DATA_FILE = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withMetrics( - new Metrics( - 42L, - null, // no column sizes - ImmutableMap.of(1, 5L), // value count - ImmutableMap.of(1, 0L), // null count - null, - ImmutableMap.of(1, ByteBuffer.allocate(1)), // lower bounds - ImmutableMap.of(1, ByteBuffer.allocate(1)) // upper bounds - )) - .build(); - - @BeforeEach - void before() { - catalog = CATALOG_EXTENSION.catalog(); - Schema schema1 = new Schema(42); - Schema schema2 = new Schema(43); - catalog.createTable(TableIdentifier.of(TABLE1), schema1); - catalog.createTable(TableIdentifier.of(TABLE2), schema2); - } - - @Test - void testCommit() throws Exception { - Table table1 = catalog.loadTable(TableIdentifier.of(TABLE1)); - assertThat(table1.snapshots()).isEmpty(); - Table table2 = catalog.loadTable(TableIdentifier.of(TABLE2)); - assertThat(table2.snapshots()).isEmpty(); - - boolean overwriteMode = false; - int workerPoolSize = 1; - String sinkId = "sinkId"; - UnregisteredMetricsGroup metricGroup = new UnregisteredMetricsGroup(); - DynamicCommitterMetrics committerMetrics = new DynamicCommitterMetrics(metricGroup); - DynamicCommitter dynamicCommitter = - new DynamicCommitter( - CATALOG_EXTENSION.catalog(), - Maps.newHashMap(), - overwriteMode, - workerPoolSize, - sinkId, - 
committerMetrics); - - WriteTarget writeTarget1 = - new WriteTarget(TABLE1, "branch", 42, 0, true, Sets.newHashSet(1, 2)); - WriteTarget writeTarget2 = - new WriteTarget(TABLE1, "branch2", 43, 0, true, Sets.newHashSet(1, 2)); - WriteTarget writeTarget3 = - new WriteTarget(TABLE2, "branch2", 43, 0, true, Sets.newHashSet(1, 2)); - - DynamicWriteResultAggregator aggregator = - new DynamicWriteResultAggregator(CATALOG_EXTENSION.catalogLoader()); - OneInputStreamOperatorTestHarness aggregatorHarness = - new OneInputStreamOperatorTestHarness(aggregator); - aggregatorHarness.open(); - - byte[] deltaManifest1 = - aggregator.writeToManifest( - writeTarget1, - Sets.newHashSet( - new DynamicWriteResult( - writeTarget1, WriteResult.builder().addDataFiles(DATA_FILE).build())), - 0); - byte[] deltaManifest2 = - aggregator.writeToManifest( - writeTarget2, - Sets.newHashSet( - new DynamicWriteResult( - writeTarget2, WriteResult.builder().addDataFiles(DATA_FILE).build())), - 0); - byte[] deltaManifest3 = - aggregator.writeToManifest( - writeTarget3, - Sets.newHashSet( - new DynamicWriteResult( - writeTarget3, WriteResult.builder().addDataFiles(DATA_FILE).build())), - 0); - - final String jobId = JobID.generate().toHexString(); - final String operatorId = new OperatorID().toHexString(); - final int checkpointId = 10; - - CommitRequest commitRequest1 = - new MockCommitRequest<>( - new DynamicCommittable(writeTarget1, deltaManifest1, jobId, operatorId, checkpointId)); - - CommitRequest commitRequest2 = - new MockCommitRequest<>( - new DynamicCommittable(writeTarget2, deltaManifest2, jobId, operatorId, checkpointId)); - - CommitRequest commitRequest3 = - new MockCommitRequest<>( - new DynamicCommittable(writeTarget3, deltaManifest3, jobId, operatorId, checkpointId)); - - dynamicCommitter.commit(Sets.newHashSet(commitRequest1, commitRequest2, commitRequest3)); - - table1.refresh(); - assertThat(table1.snapshots()).hasSize(2); - Snapshot first = Iterables.getFirst(table1.snapshots(), null); - assertThat(first.summary()) - .containsAllEntriesOf( - (Map) - ImmutableMap.builder() - .put("added-data-files", "1") - .put("added-records", "42") - .put("changed-partition-count", "1") - .put("flink.job-id", jobId) - .put("flink.max-committed-checkpoint-id", "" + checkpointId) - .put("flink.operator-id", operatorId) - .put("total-data-files", "1") - .put("total-delete-files", "0") - .put("total-equality-deletes", "0") - .put("total-files-size", "0") - .put("total-position-deletes", "0") - .put("total-records", "42") - .build()); - Snapshot second = Iterables.get(table1.snapshots(), 1, null); - assertThat(second.summary()) - .containsAllEntriesOf( - (Map) - ImmutableMap.builder() - .put("added-data-files", "1") - .put("added-records", "42") - .put("changed-partition-count", "1") - .put("flink.job-id", jobId) - .put("flink.max-committed-checkpoint-id", "" + checkpointId) - .put("flink.operator-id", operatorId) - .put("total-data-files", "1") - .put("total-delete-files", "0") - .put("total-equality-deletes", "0") - .put("total-files-size", "0") - .put("total-position-deletes", "0") - .put("total-records", "42") - .build()); - - table2.refresh(); - assertThat(table2.snapshots()).hasSize(1); - Snapshot third = Iterables.getFirst(table2.snapshots(), null); - assertThat(third.summary()) - .containsAllEntriesOf( - (Map) - ImmutableMap.builder() - .put("added-data-files", "1") - .put("added-records", "42") - .put("changed-partition-count", "1") - .put("flink.job-id", jobId) - .put("flink.max-committed-checkpoint-id", "" + 
checkpointId) - .put("flink.operator-id", operatorId) - .put("total-data-files", "1") - .put("total-delete-files", "0") - .put("total-equality-deletes", "0") - .put("total-files-size", "0") - .put("total-position-deletes", "0") - .put("total-records", "42") - .build()); - } - - @Test - void testAlreadyCommitted() throws Exception { - Table table1 = catalog.loadTable(TableIdentifier.of(TABLE1)); - assertThat(table1.snapshots()).isEmpty(); - - boolean overwriteMode = false; - int workerPoolSize = 1; - String sinkId = "sinkId"; - UnregisteredMetricsGroup metricGroup = new UnregisteredMetricsGroup(); - DynamicCommitterMetrics committerMetrics = new DynamicCommitterMetrics(metricGroup); - DynamicCommitter dynamicCommitter = - new DynamicCommitter( - CATALOG_EXTENSION.catalog(), - Maps.newHashMap(), - overwriteMode, - workerPoolSize, - sinkId, - committerMetrics); - - WriteTarget writeTarget = - new WriteTarget(TABLE1, "branch", 42, 0, false, Sets.newHashSet(1, 2)); - - DynamicWriteResultAggregator aggregator = - new DynamicWriteResultAggregator(CATALOG_EXTENSION.catalogLoader()); - OneInputStreamOperatorTestHarness aggregatorHarness = - new OneInputStreamOperatorTestHarness(aggregator); - aggregatorHarness.open(); - - final String jobId = JobID.generate().toHexString(); - final String operatorId = new OperatorID().toHexString(); - final int checkpointId = 10; - - byte[] deltaManifest = - aggregator.writeToManifest( - writeTarget, - Sets.newHashSet( - new DynamicWriteResult( - writeTarget, WriteResult.builder().addDataFiles(DATA_FILE).build())), - checkpointId); - - CommitRequest commitRequest = - new MockCommitRequest<>( - new DynamicCommittable(writeTarget, deltaManifest, jobId, operatorId, checkpointId)); - - dynamicCommitter.commit(Sets.newHashSet(commitRequest)); - - CommitRequest oldCommitRequest = - new MockCommitRequest<>( - new DynamicCommittable( - writeTarget, deltaManifest, jobId, operatorId, checkpointId - 1)); - - // Old commits requests shouldn't affect the result - dynamicCommitter.commit(Sets.newHashSet(oldCommitRequest)); - - table1.refresh(); - assertThat(table1.snapshots()).hasSize(1); - Snapshot first = Iterables.getFirst(table1.snapshots(), null); - assertThat(first.summary()) - .containsAllEntriesOf( - (Map) - ImmutableMap.builder() - .put("added-data-files", "1") - .put("added-records", "42") - .put("changed-partition-count", "1") - .put("flink.job-id", jobId) - .put("flink.max-committed-checkpoint-id", "" + checkpointId) - .put("flink.operator-id", operatorId) - .put("total-data-files", "1") - .put("total-delete-files", "0") - .put("total-equality-deletes", "0") - .put("total-files-size", "0") - .put("total-position-deletes", "0") - .put("total-records", "42") - .build()); - } - - @Test - void testReplacePartitions() throws Exception { - Table table1 = catalog.loadTable(TableIdentifier.of(TABLE1)); - assertThat(table1.snapshots()).isEmpty(); - - // Overwrite mode is active - boolean overwriteMode = true; - int workerPoolSize = 1; - String sinkId = "sinkId"; - UnregisteredMetricsGroup metricGroup = new UnregisteredMetricsGroup(); - DynamicCommitterMetrics committerMetrics = new DynamicCommitterMetrics(metricGroup); - DynamicCommitter dynamicCommitter = - new DynamicCommitter( - CATALOG_EXTENSION.catalog(), - Maps.newHashMap(), - overwriteMode, - workerPoolSize, - sinkId, - committerMetrics); - - WriteTarget writeTarget = - new WriteTarget(TABLE1, "branch", 42, 0, false, Sets.newHashSet(1, 2)); - - DynamicWriteResultAggregator aggregator = - new 
DynamicWriteResultAggregator(CATALOG_EXTENSION.catalogLoader()); - OneInputStreamOperatorTestHarness aggregatorHarness = - new OneInputStreamOperatorTestHarness(aggregator); - aggregatorHarness.open(); - - final String jobId = JobID.generate().toHexString(); - final String operatorId = new OperatorID().toHexString(); - final int checkpointId = 10; - - byte[] deltaManifest = - aggregator.writeToManifest( - writeTarget, - Sets.newHashSet( - new DynamicWriteResult( - writeTarget, WriteResult.builder().addDataFiles(DATA_FILE).build())), - checkpointId); - - CommitRequest commitRequest = - new MockCommitRequest<>( - new DynamicCommittable(writeTarget, deltaManifest, jobId, operatorId, checkpointId)); - - dynamicCommitter.commit(Sets.newHashSet(commitRequest)); - - byte[] overwriteManifest = - aggregator.writeToManifest( - writeTarget, - Sets.newHashSet( - new DynamicWriteResult( - writeTarget, WriteResult.builder().addDataFiles(DATA_FILE).build())), - checkpointId + 1); - - CommitRequest overwriteRequest = - new MockCommitRequest<>( - new DynamicCommittable( - writeTarget, overwriteManifest, jobId, operatorId, checkpointId + 1)); - - dynamicCommitter.commit(Sets.newHashSet(overwriteRequest)); - - table1.refresh(); - assertThat(table1.snapshots()).hasSize(2); - Snapshot latestSnapshot = Iterables.getLast(table1.snapshots()); - assertThat(latestSnapshot.summary()) - .containsAllEntriesOf( - (Map) - ImmutableMap.builder() - .put("replace-partitions", "true") - .put("added-data-files", "1") - .put("added-records", "42") - .put("changed-partition-count", "1") - .put("flink.job-id", jobId) - .put("flink.max-committed-checkpoint-id", String.valueOf(checkpointId + 1)) - .put("flink.operator-id", operatorId) - .put("total-data-files", "1") - .put("total-delete-files", "0") - .put("total-equality-deletes", "0") - .put("total-files-size", "0") - .put("total-position-deletes", "0") - .put("total-records", "42") - .build()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java deleted file mode 100644 index b61e297cc140..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ /dev/null @@ -1,850 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.fail; - -import java.io.IOException; -import java.io.Serializable; -import java.time.Duration; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import javax.annotation.Nullable; -import org.apache.flink.api.common.typeinfo.TypeHint; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.sink2.Committer; -import org.apache.flink.api.connector.sink2.CommitterInitContext; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.RestartStrategyOptions; -import org.apache.flink.runtime.client.JobExecutionException; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.flink.util.Collector; -import org.apache.flink.util.ExceptionUtils; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SnapshotUpdate; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.IcebergGenerics; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.sink.CommitSummary; -import org.apache.iceberg.flink.sink.TestFlinkIcebergSinkBase; -import org.apache.iceberg.inmemory.InMemoryInputFile; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -class TestDynamicIcebergSink extends TestFlinkIcebergSinkBase { - - private static long seed; - - @BeforeEach - void before() { - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(2); - seed = 0; - } - - private static class DynamicIcebergDataImpl implements Serializable { - Row rowProvided; - Row rowExpected; - Schema schemaProvided; - Schema schemaExpected; - String tableName; - String branch; - PartitionSpec partitionSpec; - boolean 
upsertMode; - Set equalityFields; - - private DynamicIcebergDataImpl( - Schema schemaProvided, String tableName, String branch, PartitionSpec partitionSpec) { - this( - schemaProvided, - schemaProvided, - tableName, - branch, - partitionSpec, - false, - Collections.emptySet(), - false); - } - - private DynamicIcebergDataImpl( - Schema schemaProvided, - Schema schemaExpected, - String tableName, - String branch, - PartitionSpec partitionSpec) { - this( - schemaProvided, - schemaExpected, - tableName, - branch, - partitionSpec, - false, - Collections.emptySet(), - false); - } - - private DynamicIcebergDataImpl( - Schema schemaProvided, - String tableName, - String branch, - PartitionSpec partitionSpec, - boolean upsertMode, - Set equalityFields, - boolean isDuplicate) { - this( - schemaProvided, - schemaProvided, - tableName, - branch, - partitionSpec, - upsertMode, - equalityFields, - isDuplicate); - } - - private DynamicIcebergDataImpl( - Schema schemaProvided, - Schema schemaExpected, - String tableName, - String branch, - PartitionSpec partitionSpec, - boolean upsertMode, - Set equalityFields, - boolean isDuplicate) { - this.rowProvided = randomRow(schemaProvided, isDuplicate ? seed : ++seed); - this.rowExpected = isDuplicate ? null : rowProvided; - this.schemaProvided = schemaProvided; - this.schemaExpected = schemaExpected; - this.tableName = tableName; - this.branch = branch; - this.partitionSpec = partitionSpec; - this.upsertMode = upsertMode; - this.equalityFields = equalityFields; - } - } - - private static class Generator implements DynamicRecordGenerator { - - @Override - public void generate(DynamicIcebergDataImpl row, Collector out) { - TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); - String branch = row.branch; - Schema schema = row.schemaProvided; - PartitionSpec spec = row.partitionSpec; - DynamicRecord dynamicRecord = - new DynamicRecord( - tableIdentifier, - branch, - schema, - converter(schema).toInternal(row.rowProvided), - spec, - spec.isPartitioned() ? 
DistributionMode.HASH : DistributionMode.NONE, - 10); - dynamicRecord.setUpsertMode(row.upsertMode); - dynamicRecord.setEqualityFields(row.equalityFields); - out.collect(dynamicRecord); - } - } - - private static DataFormatConverters.RowConverter converter(Schema schema) { - RowType rowType = FlinkSchemaUtil.convert(schema); - ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(rowType); - return new DataFormatConverters.RowConverter( - resolvedSchema.getColumnDataTypes().toArray(DataType[]::new)); - } - - @Test - void testWrite() throws Exception { - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned())); - - runTest(rows); - } - - @Test - void testWritePartitioned() throws Exception { - PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); - - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec), - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec), - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec), - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec), - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec)); - - runTest(rows); - } - - @Test - void testWritePartitionedAdjustSchemaIdsInSpec() throws Exception { - Schema schema = - new Schema( - // Use zero-based schema field ids - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required(1, "data", Types.StringType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("id", 10).build(); - Schema schema2 = - new Schema( - // Use zero-based schema field ids - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required(1, "data", Types.StringType.get()), - Types.NestedField.optional(2, "extra", Types.StringType.get())); - PartitionSpec spec2 = PartitionSpec.builderFor(schema2).bucket("extra", 23).build(); - - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl(schema, "t1", "main", spec), - new DynamicIcebergDataImpl(schema, "t1", "main", spec), - new DynamicIcebergDataImpl(schema, "t1", "main", spec), - new DynamicIcebergDataImpl(schema2, "t1", "main", spec2), - new DynamicIcebergDataImpl(schema2, "t1", "main", spec2)); - - runTest(rows); - } - - @Test - void testSchemaEvolutionFieldOrderChanges() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required(1, "data", Types.StringType.get())); - Schema expectedSchema = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - - Schema schema2 = - new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required(1, "extra", Types.StringType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - Schema expectedSchema2 = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(3, "extra", Types.StringType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl( - schema, expectedSchema, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - schema, expectedSchema, 
"t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - schema, expectedSchema, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - schema2, expectedSchema2, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - schema2, expectedSchema2, "t1", "main", PartitionSpec.unpartitioned())); - - for (DynamicIcebergDataImpl row : rows) { - if (row.schemaExpected == expectedSchema) { - // We manually adjust the expected Row to match the second expected schema - row.rowExpected = Row.of(row.rowProvided.getField(0), null, row.rowProvided.getField(1)); - } - } - - runTest(rows); - } - - @Test - void testMultipleTables() throws Exception { - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t2", "main", PartitionSpec.unpartitioned())); - - runTest(rows); - } - - @Test - void testMultipleTablesPartitioned() throws Exception { - PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); - - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec), - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t2", "main", spec)); - - runTest(rows); - } - - @Test - void testSchemaEvolutionAddField() throws Exception { - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA2, "t1", "main", PartitionSpec.unpartitioned())); - - runTest(rows, this.env, 1); - } - - @Test - void testRowEvolutionNullMissingOptionalField() throws Exception { - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA2, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned())); - - runTest(rows, this.env, 1); - } - - @Test - void testRowEvolutionMakeMissingRequiredFieldOptional() throws Exception { - Schema existingSchemaWithRequiredField = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - - CATALOG_EXTENSION - .catalog() - .createTable(TableIdentifier.of(DATABASE, "t1"), existingSchemaWithRequiredField); - - Schema writeSchemaWithoutRequiredField = - new Schema(Types.NestedField.optional(1, "id", Types.IntegerType.get())); - - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl( - writeSchemaWithoutRequiredField, - existingSchemaWithRequiredField, - "t1", - "main", - PartitionSpec.unpartitioned())); - - runTest(rows, this.env, 1); - } - - @Test - void testSchemaEvolutionNonBackwardsCompatible() throws Exception { - Schema initialSchema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); - // Type change is not allowed - Schema erroringSchema = new Schema(Types.NestedField.required(1, "id", Types.StringType.get())); - - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl(initialSchema, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - erroringSchema, "t1", "main", PartitionSpec.unpartitioned())); - - try { - runTest(rows, StreamExecutionEnvironment.getExecutionEnvironment(), 1); - fail(); - } catch (JobExecutionException e) { - assertThat( - ExceptionUtils.findThrowable( - e, t -> t.getMessage().contains("Cannot change column type: 
id: int -> string"))) - .isNotEmpty(); - } - } - - @Test - void testPartitionSpecEvolution() throws Exception { - PartitionSpec spec1 = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); - PartitionSpec spec2 = - PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 5).identity("data").build(); - - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec1), - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec2), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec1), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec2), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec1), - new DynamicIcebergDataImpl(SimpleDataUtil.SCHEMA, "t1", "main", spec2)); - - runTest(rows); - } - - @Test - void testMultipleBranches() throws Exception { - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "branch1", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned())); - - runTest(rows); - } - - @Test - void testWriteMultipleTablesWithSchemaChanges() throws Exception { - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t2", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA2, "t2", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t2", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA2, "t2", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned())); - - runTest(rows); - } - - @Test - void testUpsert() throws Exception { - List rows = - Lists.newArrayList( - // Insert one rows - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, - "t1", - "main", - PartitionSpec.unpartitioned(), - true, - Sets.newHashSet("id"), - false), - // Remaining rows are duplicates - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, - "t1", - "main", - PartitionSpec.unpartitioned(), - true, - Sets.newHashSet("id"), - true), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, - "t1", - "main", - PartitionSpec.unpartitioned(), - true, - Sets.newHashSet("id"), - true), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, - "t1", - "main", - PartitionSpec.unpartitioned(), - true, - Sets.newHashSet("id"), - true)); - - executeDynamicSink(rows, env, true, 1, null); - - try (CloseableIterable iterable = - IcebergGenerics.read( - CATALOG_EXTENSION.catalog().loadTable(TableIdentifier.of("default", "t1"))) - .build()) { - List records = Lists.newArrayList(); - for (Record record : iterable) { - records.add(record); - } - - 
assertThat(records).hasSize(1); - Record actual = records.get(0); - DynamicIcebergDataImpl input = rows.get(0); - assertThat(actual.get(0)).isEqualTo(input.rowProvided.getField(0)); - assertThat(actual.get(1)).isEqualTo(input.rowProvided.getField(1)); - // There is an additional _pos field which gets added - } - } - - @Test - void testCommitFailedBeforeOrAfterCommit() throws Exception { - // Configure a Restart strategy to allow recovery - Configuration configuration = new Configuration(); - configuration.set(RestartStrategyOptions.RESTART_STRATEGY, "fixed-delay"); - configuration.set(RestartStrategyOptions.RESTART_STRATEGY_FIXED_DELAY_ATTEMPTS, 2); - configuration.set(RestartStrategyOptions.RESTART_STRATEGY_FIXED_DELAY_DELAY, Duration.ZERO); - env.configure(configuration); - - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t2", "main", PartitionSpec.unpartitioned())); - - FailBeforeAndAfterCommit.reset(); - final CommitHook commitHook = new FailBeforeAndAfterCommit(); - assertThat(FailBeforeAndAfterCommit.failedBeforeCommit).isFalse(); - assertThat(FailBeforeAndAfterCommit.failedAfterCommit).isFalse(); - - executeDynamicSink(rows, env, true, 1, commitHook); - - assertThat(FailBeforeAndAfterCommit.failedBeforeCommit).isTrue(); - assertThat(FailBeforeAndAfterCommit.failedAfterCommit).isTrue(); - } - - @Test - void testCommitConcurrency() throws Exception { - - List rows = - Lists.newArrayList( - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t1", "main", PartitionSpec.unpartitioned()), - new DynamicIcebergDataImpl( - SimpleDataUtil.SCHEMA, "t2", "main", PartitionSpec.unpartitioned())); - - TableIdentifier tableIdentifier = TableIdentifier.of("default", "t1"); - Catalog catalog = CATALOG_EXTENSION.catalog(); - catalog.createTable(tableIdentifier, new Schema()); - - final CommitHook commitHook = new AppendRightBeforeCommit(tableIdentifier.toString()); - - executeDynamicSink(rows, env, true, 1, commitHook); - } - - interface CommitHook extends Serializable { - void beforeCommit(); - - void duringCommit(); - - void afterCommit(); - } - - private static class FailBeforeAndAfterCommit implements CommitHook { - - static boolean failedBeforeCommit; - static boolean failedAfterCommit; - - @Override - public void beforeCommit() { - if (!failedBeforeCommit) { - failedBeforeCommit = true; - throw new RuntimeException("Failing before commit"); - } - } - - @Override - public void duringCommit() {} - - @Override - public void afterCommit() { - if (!failedAfterCommit) { - failedAfterCommit = true; - throw new RuntimeException("Failing before commit"); - } - } - - static void reset() { - failedBeforeCommit = false; - failedAfterCommit = false; - } - } - - private static class AppendRightBeforeCommit implements CommitHook { - - final String tableIdentifier; - - private AppendRightBeforeCommit(String tableIdentifier) { - this.tableIdentifier = tableIdentifier; - } - - @Override - public void beforeCommit() {} - - @Override - public void duringCommit() { - // Create a conflict - Table table = CATALOG_EXTENSION.catalog().loadTable(TableIdentifier.parse(tableIdentifier)); - DataFile dataFile = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withInputFile(new InMemoryInputFile(new byte[] {1, 2, 3})) - .withFormat(FileFormat.AVRO) - .withRecordCount(3) - .build(); - table.newAppend().appendFile(dataFile).commit(); - } - - @Override - public void afterCommit() 
{} - } - - private void runTest(List dynamicData) throws Exception { - runTest(dynamicData, this.env, 2); - } - - private void runTest( - List dynamicData, StreamExecutionEnvironment env, int parallelism) - throws Exception { - runTest(dynamicData, env, true, parallelism); - runTest(dynamicData, env, false, parallelism); - } - - private void runTest( - List dynamicData, - StreamExecutionEnvironment env, - boolean immediateUpdate, - int parallelism) - throws Exception { - executeDynamicSink(dynamicData, env, immediateUpdate, parallelism, null); - verifyResults(dynamicData); - } - - private void executeDynamicSink( - List dynamicData, - StreamExecutionEnvironment env, - boolean immediateUpdate, - int parallelism, - @Nullable CommitHook commitHook) - throws Exception { - DataStream dataStream = - env.addSource(createBoundedSource(dynamicData), TypeInformation.of(new TypeHint<>() {})); - env.setParallelism(parallelism); - - if (commitHook != null) { - new CommitHookEnabledDynamicIcebergSink(commitHook) - .forInput(dataStream) - .generator(new Generator()) - .catalogLoader(CATALOG_EXTENSION.catalogLoader()) - .writeParallelism(parallelism) - .immediateTableUpdate(immediateUpdate) - .setSnapshotProperty("commit.retry.num-retries", "0") - .append(); - } else { - DynamicIcebergSink.forInput(dataStream) - .generator(new Generator()) - .catalogLoader(CATALOG_EXTENSION.catalogLoader()) - .writeParallelism(parallelism) - .immediateTableUpdate(immediateUpdate) - .append(); - } - - // Write the data - env.execute("Test Iceberg DataStream"); - } - - static class CommitHookEnabledDynamicIcebergSink extends DynamicIcebergSink.Builder { - private final CommitHook commitHook; - - CommitHookEnabledDynamicIcebergSink(CommitHook commitHook) { - this.commitHook = commitHook; - } - - @Override - DynamicIcebergSink instantiateSink( - Map writeProperties, FlinkWriteConf flinkWriteConf) { - return new CommitHookDynamicIcebergSink( - commitHook, - CATALOG_EXTENSION.catalogLoader(), - Collections.emptyMap(), - "uidPrefix", - writeProperties, - flinkWriteConf, - 100); - } - } - - static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { - - private final CommitHook commitHook; - - CommitHookDynamicIcebergSink( - CommitHook commitHook, - CatalogLoader catalogLoader, - Map snapshotProperties, - String uidPrefix, - Map writeProperties, - FlinkWriteConf flinkWriteConf, - int cacheMaximumSize) { - super( - catalogLoader, - snapshotProperties, - uidPrefix, - writeProperties, - flinkWriteConf, - cacheMaximumSize); - this.commitHook = commitHook; - } - - @Override - public Committer createCommitter(CommitterInitContext context) { - // return super.createCommitter(context); - return new CommitHookEnabledDynamicCommitter( - commitHook, - CATALOG_EXTENSION.catalogLoader().loadCatalog(), - Collections.emptyMap(), - false, - 10, - "sinkId", - new DynamicCommitterMetrics(context.metricGroup())); - } - } - - static class CommitHookEnabledDynamicCommitter extends DynamicCommitter { - private final CommitHook commitHook; - - CommitHookEnabledDynamicCommitter( - CommitHook commitHook, - Catalog catalog, - Map snapshotProperties, - boolean replacePartitions, - int workerPoolSize, - String sinkId, - DynamicCommitterMetrics committerMetrics) { - super( - catalog, snapshotProperties, replacePartitions, workerPoolSize, sinkId, committerMetrics); - this.commitHook = commitHook; - } - - @Override - public void commit(Collection> commitRequests) - throws IOException, InterruptedException { - commitHook.beforeCommit(); - 
super.commit(commitRequests); - commitHook.afterCommit(); - } - - @Override - void commitOperation( - Table table, - String branch, - SnapshotUpdate operation, - CommitSummary summary, - String description, - String newFlinkJobId, - String operatorId, - long checkpointId) { - commitHook.duringCommit(); - super.commitOperation( - table, branch, operation, summary, description, newFlinkJobId, operatorId, checkpointId); - } - } - - private void verifyResults(List dynamicData) throws IOException { - // Calculate the expected result - Map, List> expectedData = Maps.newHashMap(); - Map expectedSchema = Maps.newHashMap(); - dynamicData.forEach( - r -> { - Schema oldSchema = expectedSchema.get(r.tableName); - if (oldSchema == null || oldSchema.columns().size() < r.schemaProvided.columns().size()) { - expectedSchema.put(r.tableName, r.schemaExpected); - } - }); - - dynamicData.forEach( - r -> { - List data = - expectedData.computeIfAbsent( - Tuple2.of(r.tableName, r.branch), unused -> Lists.newArrayList()); - data.addAll( - convertToRowData(expectedSchema.get(r.tableName), ImmutableList.of(r.rowExpected))); - }); - - // Check the expected result - int count = dynamicData.size(); - for (Map.Entry, List> e : expectedData.entrySet()) { - SimpleDataUtil.assertTableRows( - CATALOG_EXTENSION - .catalogLoader() - .loadCatalog() - .loadTable(TableIdentifier.of(DATABASE, e.getKey().f0)), - e.getValue(), - e.getKey().f1); - count -= e.getValue().size(); - } - - // Found every record - assertThat(count).isZero(); - } - - private List convertToRowData(Schema schema, List rows) { - DataFormatConverters.RowConverter converter = converter(schema); - return rows.stream() - .map( - r -> { - Row updateRow = r; - // We need conversion to generate the missing columns - if (r.getArity() != schema.columns().size()) { - updateRow = new Row(schema.columns().size()); - for (int i = 0; i < r.getArity(); ++i) { - updateRow.setField(i, r.getField(i)); - } - } - return converter.toInternal(updateRow); - }) - .collect(Collectors.toList()); - } - - private static Row randomRow(Schema schema, long seedOverride) { - return TestHelpers.convertRecordToRow( - RandomGenericData.generate(schema, 1, seedOverride), schema) - .get(0); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java deleted file mode 100644 index ae5b2f67120b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSinkPerf.java +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.apache.iceberg.flink.TestFixtures.TABLE; -import static org.apache.iceberg.flink.sink.dynamic.DynamicCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; - -import java.util.List; -import java.util.function.Consumer; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.flink.api.common.functions.FlatMapFunction; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; -import org.apache.flink.util.Collector; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.RowDataConverter; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.sink.IcebergSink; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Performance test class to compare {@link DynamicIcebergSink} against {@link IcebergSink} to - * measure and compare their throughput. - * - *
<p>The test dynamically generates input for multiple tables, then writes to these tables. For the
- * DynamicSink, a single sink is used to write all tables. For the IcebergSink, one sink is used per
- * table. The test logs the written record counts and elapsed time based on the Iceberg snapshot
- * metadata.
- *
- * <p><b>Usage</b>
- *
- * <ul>
- *   <li>Set the SAMPLE_SIZE, RECORD_SIZE, and TABLE_NUM.
- *   <li>Run the unit tests and review logs for performance results.
- * </ul>
- *
- * <p>
    Note: This test is disabled by default and should be enabled manually when performance testing - * is needed. It is not intended as a standard unit test. - */ -@Disabled("Please enable manually for performance testing.") -class TestDynamicIcebergSinkPerf { - private static final Logger LOG = LoggerFactory.getLogger(TestDynamicIcebergSinkPerf.class); - - @RegisterExtension - protected static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TABLE); - - private static final int SAMPLE_SIZE = 50_000; - private static final int RECORD_SIZE = 5_000_000; - private static final int TABLE_NUM = 3; - private static final int PARALLELISM = 2; - private static final int WRITE_PARALLELISM = 2; - private static final TableIdentifier[] IDENTIFIERS = new TableIdentifier[TABLE_NUM]; - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "name2", Types.StringType.get()), - Types.NestedField.required(3, "name3", Types.StringType.get()), - Types.NestedField.required(4, "name4", Types.StringType.get()), - Types.NestedField.required(5, "name5", Types.StringType.get()), - Types.NestedField.required(6, "name6", Types.StringType.get()), - Types.NestedField.required(7, "name7", Types.StringType.get()), - Types.NestedField.required(8, "name8", Types.StringType.get()), - Types.NestedField.required(9, "name9", Types.StringType.get())); - private static final List RANGE = - IntStream.range(0, RECORD_SIZE).boxed().collect(Collectors.toList()); - - private static List rows; - private StreamExecutionEnvironment env; - - @BeforeEach - void before() { - for (int i = 0; i < TABLE_NUM; ++i) { - // So the table name hash difference is bigger than 1 - IDENTIFIERS[i] = TableIdentifier.of(DATABASE, TABLE + "_" + (i * 13)); - - Table table = - CATALOG_EXTENSION - .catalog() - .createTable( - IDENTIFIERS[i], - SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of(MAX_CONTINUOUS_EMPTY_COMMITS, "100000")); - - table.manageSnapshots().createBranch("main").commit(); - } - - List records = RandomGenericData.generate(SCHEMA, SAMPLE_SIZE, 1L); - rows = Lists.newArrayListWithCapacity(records.size()); - for (int i = 0; i < records.size(); ++i) { - rows.add( - new DynamicRecord( - IDENTIFIERS[i % TABLE_NUM], - "main", - SCHEMA, - RowDataConverter.convert(SCHEMA, records.get(i)), - PartitionSpec.unpartitioned(), - DistributionMode.NONE, - WRITE_PARALLELISM)); - } - - Configuration configuration = MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; - configuration.setString("rest.flamegraph.enabled", "true"); - env = - StreamExecutionEnvironment.getExecutionEnvironment(configuration) - .enableCheckpointing(100) - .setParallelism(PARALLELISM) - .setMaxParallelism(PARALLELISM); - env.getConfig().enableObjectReuse(); - } - - @AfterEach - void after() { - for (TableIdentifier identifier : IDENTIFIERS) { - CATALOG_EXTENSION.catalog().dropTable(identifier); - } - } - - private static class IdBasedGenerator implements DynamicRecordGenerator { - - @Override - public void generate(Integer id, Collector out) { - out.collect(rows.get(id % SAMPLE_SIZE)); - } - } - - @Test - void testDynamicSink() throws Exception { - // So we make sure that the writer threads are the same for the 2 tests - env.setMaxParallelism(PARALLELISM * TABLE_NUM * 2); - env.setParallelism(PARALLELISM * TABLE_NUM * 2); - runTest( - s -> { - DynamicIcebergSink.forInput(s) - .generator(new IdBasedGenerator()) - .immediateTableUpdate(true) 
- .catalogLoader(CATALOG_EXTENSION.catalogLoader()) - .append(); - }); - } - - @Test - void testIcebergSink() throws Exception { - runTest( - s -> { - for (int i = 0; i < IDENTIFIERS.length; ++i) { - TableLoader tableLoader = - TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), IDENTIFIERS[i]); - final int finalInt = i; - IcebergSink.forRowData( - s.flatMap( - (FlatMapFunction) - (input, collector) -> { - if (input % TABLE_NUM == finalInt) { - collector.collect(rows.get(input % SAMPLE_SIZE).rowData()); - } - }) - .returns(InternalTypeInfo.of(FlinkSchemaUtil.convert(SCHEMA))) - .rebalance()) - .tableLoader(tableLoader) - .uidSuffix("Uid" + i) - .writeParallelism(WRITE_PARALLELISM) - .append(); - } - }); - } - - private void runTest(Consumer> sink) throws Exception { - DataStream dataStream = - env.addSource( - new BoundedTestSource<>( - ImmutableList.of( - RANGE, RANGE, RANGE, RANGE, RANGE, RANGE, RANGE, RANGE, RANGE, RANGE), - true), - TypeInformation.of(Integer.class)); - - sink.accept(dataStream); - - long before = System.currentTimeMillis(); - env.execute(); - - for (TableIdentifier identifier : IDENTIFIERS) { - Table table = CATALOG_EXTENSION.catalog().loadTable(identifier); - for (Snapshot snapshot : table.snapshots()) { - long records = 0; - for (DataFile dataFile : snapshot.addedDataFiles(table.io())) { - records += dataFile.recordCount(); - } - - LOG.info( - "TEST RESULT: For table {} snapshot {} written {} records in {} ms", - identifier, - snapshot.snapshotId(), - records, - snapshot.timestampMillis() - before); - before = snapshot.timestampMillis(); - } - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java deleted file mode 100644 index ab8ce98c3594..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchema.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -/** Test writing DynamicRecord with the full schema */ -class TestDynamicRecordInternalSerializerWriteSchema - extends DynamicRecordInternalSerializerTestBase { - - TestDynamicRecordInternalSerializerWriteSchema() { - super(true); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java deleted file mode 100644 index 1d8890546214..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordInternalSerializerWriteSchemaId.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -/** Test writing DynamicRecord with only the schema id. */ -class TestDynamicRecordInternalSerializerWriteSchemaId - extends DynamicRecordInternalSerializerTestBase { - - TestDynamicRecordInternalSerializerWriteSchemaId() { - super(false); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java deleted file mode 100644 index 01e2c440df67..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.apache.iceberg.flink.TestFixtures.TABLE; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Collections; -import org.apache.flink.api.common.functions.OpenContext; -import org.apache.flink.table.data.GenericRowData; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; - -class TestDynamicTableUpdateOperator { - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TABLE); - - private static final Schema SCHEMA1 = - new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); - - private static final Schema SCHEMA2 = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); - - @Test - void testDynamicTableUpdateOperatorNewTable() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; - Catalog catalog = CATALOG_EXTENSION.catalog(); - TableIdentifier table = TableIdentifier.of(TABLE); - - assertThat(catalog.tableExists(table)).isFalse(); - DynamicTableUpdateOperator operator = - new DynamicTableUpdateOperator( - CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize); - operator.open((OpenContext) null); - - DynamicRecordInternal input = - new DynamicRecordInternal( - TABLE, - "branch", - SCHEMA1, - GenericRowData.of(1, "test"), - PartitionSpec.unpartitioned(), - 42, - false, - Collections.emptySet()); - DynamicRecordInternal output = operator.map(input); - - assertThat(catalog.tableExists(table)).isTrue(); - assertThat(input).isEqualTo(output); - } - - @Test - void testDynamicTableUpdateOperatorSchemaChange() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; - Catalog catalog = CATALOG_EXTENSION.catalog(); - TableIdentifier table = TableIdentifier.of(TABLE); - - DynamicTableUpdateOperator operator = - new DynamicTableUpdateOperator( - CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize); - operator.open((OpenContext) null); - - catalog.createTable(table, SCHEMA1); - DynamicRecordInternal input = - new DynamicRecordInternal( - TABLE, - "branch", - SCHEMA2, - GenericRowData.of(1, "test"), - PartitionSpec.unpartitioned(), - 42, - false, - Collections.emptySet()); - DynamicRecordInternal output = operator.map(input); - - assertThat(catalog.loadTable(table).schema().sameSchema(SCHEMA2)).isTrue(); - assertThat(input).isEqualTo(output); - - // Process the same input again - DynamicRecordInternal output2 = operator.map(input); - assertThat(output2).isEqualTo(output); - assertThat(catalog.loadTable(table).schema().schemaId()).isEqualTo(output.schema().schemaId()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java deleted file mode 100644 index 
713c67da170a..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultAggregator.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; -import org.apache.flink.streaming.api.connector.sink2.CommittableWithLineage; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.hadoop.util.Sets; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.io.WriteResult; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; - -class TestDynamicWriteResultAggregator { - - @RegisterExtension - static final HadoopCatalogExtension CATALOG_EXTENSION = new HadoopCatalogExtension("db", "table"); - - @Test - void testAggregator() throws Exception { - CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("table"), new Schema()); - CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("table2"), new Schema()); - - DynamicWriteResultAggregator aggregator = - new DynamicWriteResultAggregator(CATALOG_EXTENSION.catalogLoader()); - try (OneInputStreamOperatorTestHarness< - CommittableMessage, CommittableMessage> - testHarness = new OneInputStreamOperatorTestHarness<>(aggregator)) { - testHarness.open(); - - WriteTarget writeTarget1 = new WriteTarget("table", "branch", 42, 0, true, Sets.newHashSet()); - DynamicWriteResult dynamicWriteResult1 = - new DynamicWriteResult(writeTarget1, WriteResult.builder().build()); - WriteTarget writeTarget2 = - new WriteTarget("table2", "branch", 42, 0, true, Sets.newHashSet(1, 2)); - DynamicWriteResult dynamicWriteResult2 = - new DynamicWriteResult(writeTarget2, WriteResult.builder().build()); - - CommittableWithLineage committable1 = - new CommittableWithLineage<>(dynamicWriteResult1, 0, 0); - StreamRecord> record1 = - new StreamRecord<>(committable1); - testHarness.processElement(record1); - CommittableWithLineage committable2 = - new CommittableWithLineage<>(dynamicWriteResult2, 0, 0); - StreamRecord> record2 = - new StreamRecord<>(committable2); - testHarness.processElement(record2); - - assertThat(testHarness.getOutput()).isEmpty(); - - testHarness.prepareSnapshotPreBarrier(1L); - // Contains a CommittableSummary + DynamicCommittable - assertThat(testHarness.getRecordOutput()).hasSize(3); - - testHarness.prepareSnapshotPreBarrier(2L); - // Only contains a CommittableSummary - 
assertThat(testHarness.getRecordOutput()).hasSize(4); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java deleted file mode 100644 index a3a9691107eb..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriteResultSerializer.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.nio.ByteBuffer; -import org.apache.hadoop.util.Sets; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.Test; - -class TestDynamicWriteResultSerializer { - - private static final DataFile DATA_FILE = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withMetrics( - new Metrics( - 42L, - null, - ImmutableMap.of(1, 5L), - ImmutableMap.of(1, 0L), - null, - ImmutableMap.of(1, ByteBuffer.allocate(1)), - ImmutableMap.of(1, ByteBuffer.allocate(1)))) - .build(); - - @Test - void testRoundtrip() throws IOException { - DynamicWriteResult dynamicWriteResult = - new DynamicWriteResult( - new WriteTarget("table", "branch", 42, 23, false, Sets.newHashSet(1, 2)), - WriteResult.builder().addDataFiles(DATA_FILE).build()); - - DynamicWriteResultSerializer serializer = new DynamicWriteResultSerializer(); - DynamicWriteResult copy = - serializer.deserialize(serializer.getVersion(), serializer.serialize(dynamicWriteResult)); - - assertThat(copy.writeResult().dataFiles()).hasSize(1); - DataFile dataFile = copy.writeResult().dataFiles()[0]; - // DataFile doesn't implement equals, but we can still do basic checks - assertThat(dataFile.path()).isEqualTo("/path/to/data-1.parquet"); - assertThat(dataFile.recordCount()).isEqualTo(42L); - } - - @Test - void testUnsupportedVersion() throws IOException { - DynamicWriteResult dynamicWriteResult = - new DynamicWriteResult( - new WriteTarget("table", "branch", 42, 23, false, Sets.newHashSet(1, 2)), - WriteResult.builder().addDataFiles(DATA_FILE).build()); - - DynamicWriteResultSerializer serializer = new DynamicWriteResultSerializer(); - assertThatThrownBy(() -> serializer.deserialize(-1, serializer.serialize(dynamicWriteResult))) - 
.hasMessage("Unrecognized version or corrupt state: -1") - .isInstanceOf(IOException.class); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java deleted file mode 100644 index 42875982a000..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.File; -import java.net.URI; -import java.util.Collection; -import java.util.Map; -import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.common.DynFields; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.sink.FlinkAppenderFactory; -import org.apache.iceberg.flink.sink.TestFlinkIcebergSinkBase; -import org.apache.iceberg.io.BaseTaskWriter; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.jetbrains.annotations.NotNull; -import org.junit.jupiter.api.Test; - -class TestDynamicWriter extends TestFlinkIcebergSinkBase { - - private static final TableIdentifier TABLE1 = TableIdentifier.of("myTable1"); - private static final TableIdentifier TABLE2 = TableIdentifier.of("myTable2"); - - @Test - void testDynamicWriter() throws Exception { - Catalog catalog = CATALOG_EXTENSION.catalog(); - Table table1 = catalog.createTable(TABLE1, SimpleDataUtil.SCHEMA); - Table table2 = catalog.createTable(TABLE2, SimpleDataUtil.SCHEMA); - - DynamicWriter dynamicWriter = createDynamicWriter(catalog); - - DynamicRecordInternal record1 = getDynamicRecordInternal(table1); - DynamicRecordInternal record2 = getDynamicRecordInternal(table2); - - assertThat(getNumDataFiles(table1)).isEqualTo(0); - - dynamicWriter.write(record1, null); - dynamicWriter.write(record2, null); - Collection writeResults = dynamicWriter.prepareCommit(); - - assertThat(writeResults).hasSize(2); - assertThat(getNumDataFiles(table1)).isEqualTo(1); - assertThat( - dynamicWriter - .getMetrics() - .writerMetrics(TABLE1.name()) - .getFlushedDataFiles() - .getCount()) - .isEqualTo(1); - assertThat( - 
dynamicWriter - .getMetrics() - .writerMetrics(TABLE2.name()) - .getFlushedDataFiles() - .getCount()) - .isEqualTo(1); - - WriteResult wr1 = writeResults.iterator().next().writeResult(); - assertThat(wr1.dataFiles().length).isEqualTo(1); - assertThat(wr1.dataFiles()[0].format()).isEqualTo(FileFormat.PARQUET); - assertThat(wr1.deleteFiles()).isEmpty(); - - dynamicWriter.write(record1, null); - dynamicWriter.write(record2, null); - writeResults = dynamicWriter.prepareCommit(); - - assertThat(writeResults).hasSize(2); - assertThat(getNumDataFiles(table1)).isEqualTo(2); - assertThat( - dynamicWriter - .getMetrics() - .writerMetrics(TABLE1.name()) - .getFlushedDataFiles() - .getCount()) - .isEqualTo(2); - assertThat( - dynamicWriter - .getMetrics() - .writerMetrics(TABLE2.name()) - .getFlushedDataFiles() - .getCount()) - .isEqualTo(2); - - WriteResult wr2 = writeResults.iterator().next().writeResult(); - assertThat(wr2.dataFiles().length).isEqualTo(1); - assertThat(wr2.dataFiles()[0].format()).isEqualTo(FileFormat.PARQUET); - assertThat(wr2.deleteFiles()).isEmpty(); - - dynamicWriter.close(); - } - - @Test - void testDynamicWriterPropertiesDefault() throws Exception { - Catalog catalog = CATALOG_EXTENSION.catalog(); - Table table1 = - catalog.createTable( - TABLE1, - SimpleDataUtil.SCHEMA, - null, - ImmutableMap.of("write.parquet.compression-codec", "zstd")); - - DynamicWriter dynamicWriter = createDynamicWriter(catalog); - DynamicRecordInternal record1 = getDynamicRecordInternal(table1); - - assertThat(getNumDataFiles(table1)).isEqualTo(0); - - dynamicWriter.write(record1, null); - Map properties = properties(dynamicWriter); - assertThat(properties).containsEntry("write.parquet.compression-codec", "zstd"); - - dynamicWriter.close(); - } - - @Test - void testDynamicWriterPropertiesPriority() throws Exception { - Catalog catalog = CATALOG_EXTENSION.catalog(); - Table table1 = - catalog.createTable( - TABLE1, - SimpleDataUtil.SCHEMA, - null, - ImmutableMap.of("write.parquet.compression-codec", "zstd")); - - DynamicWriter dynamicWriter = - createDynamicWriter(catalog, ImmutableMap.of("write.parquet.compression-codec", "gzip")); - DynamicRecordInternal record1 = getDynamicRecordInternal(table1); - - assertThat(getNumDataFiles(table1)).isEqualTo(0); - - dynamicWriter.write(record1, null); - Map properties = properties(dynamicWriter); - assertThat(properties).containsEntry("write.parquet.compression-codec", "gzip"); - - dynamicWriter.close(); - } - - @Test - void testDynamicWriterUpsert() throws Exception { - Catalog catalog = CATALOG_EXTENSION.catalog(); - DynamicWriter dyamicWriter = createDynamicWriter(catalog); - Table table1 = CATALOG_EXTENSION.catalog().createTable(TABLE1, SimpleDataUtil.SCHEMA); - - DynamicRecordInternal record = getDynamicRecordInternal(table1); - record.setUpsertMode(true); - record.setEqualityFieldIds(Sets.newHashSet(1)); - - dyamicWriter.write(record, null); - dyamicWriter.prepareCommit(); - - assertThat( - dyamicWriter - .getMetrics() - .writerMetrics(TABLE1.name()) - .getFlushedDeleteFiles() - .getCount()) - .isEqualTo(1); - assertThat( - dyamicWriter.getMetrics().writerMetrics(TABLE1.name()).getFlushedDataFiles().getCount()) - .isEqualTo(1); - } - - @Test - void testDynamicWriterUpsertNoEqualityFields() { - Catalog catalog = CATALOG_EXTENSION.catalog(); - DynamicWriter dyamicWriter = createDynamicWriter(catalog); - Table table1 = CATALOG_EXTENSION.catalog().createTable(TABLE1, SimpleDataUtil.SCHEMA); - - DynamicRecordInternal record = getDynamicRecordInternal(table1); - 
record.setUpsertMode(true); - - assertThatThrownBy(() -> dyamicWriter.write(record, null)) - .isInstanceOf(IllegalStateException.class) - .hasMessage( - "Equality field columns shouldn't be empty when configuring to use UPSERT data."); - } - - private static @NotNull DynamicWriter createDynamicWriter( - Catalog catalog, Map properties) { - DynamicWriter dynamicWriter = - new DynamicWriter( - catalog, - FileFormat.PARQUET, - 1024L, - properties, - 100, - new DynamicWriterMetrics(new UnregisteredMetricsGroup()), - 0, - 0); - return dynamicWriter; - } - - private static @NotNull DynamicWriter createDynamicWriter(Catalog catalog) { - return createDynamicWriter(catalog, Map.of()); - } - - private static @NotNull DynamicRecordInternal getDynamicRecordInternal(Table table1) { - DynamicRecordInternal record = new DynamicRecordInternal(); - record.setTableName(TableIdentifier.parse(table1.name()).name()); - record.setSchema(table1.schema()); - record.setSpec(table1.spec()); - record.setRowData(SimpleDataUtil.createRowData(1, "test")); - return record; - } - - private static int getNumDataFiles(Table table) { - File dataDir = new File(URI.create(table.location()).getPath(), "data"); - if (dataDir.exists()) { - return dataDir.listFiles((dir, name) -> !name.startsWith(".")).length; - } - return 0; - } - - private Map properties(DynamicWriter dynamicWriter) { - DynFields.BoundField>> writerField = - DynFields.builder().hiddenImpl(dynamicWriter.getClass(), "writers").build(dynamicWriter); - - DynFields.BoundField appenderField = - DynFields.builder() - .hiddenImpl(BaseTaskWriter.class, "appenderFactory") - .build(writerField.get().values().iterator().next()); - DynFields.BoundField> propsField = - DynFields.builder() - .hiddenImpl(FlinkAppenderFactory.class, "props") - .build(appenderField.get()); - return propsField.get(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java deleted file mode 100644 index d416e7ec1fc6..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestEvolveSchemaVisitor.java +++ /dev/null @@ -1,626 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.apache.iceberg.types.Types.NestedField.of; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.lang.reflect.Constructor; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.iceberg.Schema; -import org.apache.iceberg.UpdateSchema; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type.PrimitiveType; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.types.Types.DecimalType; -import org.apache.iceberg.types.Types.DoubleType; -import org.apache.iceberg.types.Types.FloatType; -import org.apache.iceberg.types.Types.IntegerType; -import org.apache.iceberg.types.Types.ListType; -import org.apache.iceberg.types.Types.LongType; -import org.apache.iceberg.types.Types.MapType; -import org.apache.iceberg.types.Types.StringType; -import org.apache.iceberg.types.Types.StructType; -import org.apache.iceberg.types.Types.TimeType; -import org.apache.iceberg.types.Types.UUIDType; -import org.junit.jupiter.api.Test; - -public class TestEvolveSchemaVisitor { - - private static List primitiveTypes() { - return Lists.newArrayList( - StringType.get(), - TimeType.get(), - Types.TimestampType.withoutZone(), - Types.TimestampType.withZone(), - UUIDType.get(), - Types.DateType.get(), - Types.BooleanType.get(), - Types.BinaryType.get(), - DoubleType.get(), - IntegerType.get(), - Types.FixedType.ofLength(10), - DecimalType.of(10, 2), - LongType.get(), - FloatType.get()); - } - - private static Types.NestedField[] primitiveFields( - Integer initialValue, List primitiveTypes) { - return primitiveFields(initialValue, primitiveTypes, true); - } - - private static Types.NestedField[] primitiveFields( - Integer initialValue, List primitiveTypes, boolean optional) { - AtomicInteger atomicInteger = new AtomicInteger(initialValue); - return primitiveTypes.stream() - .map( - type -> - of( - atomicInteger.incrementAndGet(), - optional, - type.toString(), - Types.fromPrimitiveString(type.toString()))) - .toArray(Types.NestedField[]::new); - } - - @Test - public void testAddTopLevelPrimitives() { - Schema targetSchema = new Schema(primitiveFields(0, primitiveTypes())); - UpdateSchema updateApi = loadUpdateApi(new Schema()); - EvolveSchemaVisitor.visit(updateApi, new Schema(), targetSchema); - assertThat(targetSchema.asStruct()).isEqualTo(updateApi.apply().asStruct()); - } - - @Test - public void testMakeTopLevelPrimitivesOptional() { - Schema existingSchema = new Schema(primitiveFields(0, primitiveTypes(), false)); - assertThat(existingSchema.columns().stream().allMatch(Types.NestedField::isRequired)).isTrue(); - - UpdateSchema updateApi = loadUpdateApi(existingSchema); - EvolveSchemaVisitor.visit(updateApi, existingSchema, new Schema()); - Schema newSchema = updateApi.apply(); - assertThat(newSchema.asStruct().fields()).hasSize(14); - assertThat(newSchema.columns().stream().allMatch(Types.NestedField::isOptional)).isTrue(); - } - - @Test - public void testIdentifyFieldsByName() { - Schema existingSchema = - new Schema(Types.NestedField.optional(42, "myField", Types.LongType.get())); - UpdateSchema updateApi = loadUpdateApi(existingSchema); - Schema newSchema = - new 
Schema(Arrays.asList(Types.NestedField.optional(-1, "myField", Types.LongType.get()))); - EvolveSchemaVisitor.visit(updateApi, existingSchema, newSchema); - assertThat(updateApi.apply().sameSchema(existingSchema)).isTrue(); - } - - @Test - public void testChangeOrderTopLevelPrimitives() { - Schema existingSchema = - new Schema( - Arrays.asList(optional(1, "a", StringType.get()), optional(2, "b", StringType.get()))); - Schema targetSchema = - new Schema( - Arrays.asList(optional(2, "b", StringType.get()), optional(1, "a", StringType.get()))); - UpdateSchema updateApi = loadUpdateApi(existingSchema); - EvolveSchemaVisitor.visit(updateApi, existingSchema, targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - - @Test - public void testAddTopLevelListOfPrimitives() { - for (PrimitiveType primitiveType : primitiveTypes()) { - Schema targetSchema = new Schema(optional(1, "aList", ListType.ofOptional(2, primitiveType))); - UpdateSchema updateApi = loadUpdateApi(new Schema()); - EvolveSchemaVisitor.visit(updateApi, new Schema(), targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - } - - @Test - public void testMakeTopLevelListOfPrimitivesOptional() { - for (PrimitiveType primitiveType : primitiveTypes()) { - Schema existingSchema = - new Schema(optional(1, "aList", ListType.ofRequired(2, primitiveType))); - Schema targetSchema = new Schema(); - UpdateSchema updateApi = loadUpdateApi(existingSchema); - EvolveSchemaVisitor.visit(updateApi, existingSchema, targetSchema); - Schema expectedSchema = - new Schema(optional(1, "aList", ListType.ofRequired(2, primitiveType))); - assertThat(updateApi.apply().asStruct()).isEqualTo(expectedSchema.asStruct()); - } - } - - @Test - public void testAddTopLevelMapOfPrimitives() { - for (PrimitiveType primitiveType : primitiveTypes()) { - Schema targetSchema = - new Schema(optional(1, "aMap", MapType.ofOptional(2, 3, primitiveType, primitiveType))); - UpdateSchema updateApi = loadUpdateApi(new Schema()); - EvolveSchemaVisitor.visit(updateApi, new Schema(), targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - } - - @Test - public void testAddTopLevelStructOfPrimitives() { - for (PrimitiveType primitiveType : primitiveTypes()) { - Schema currentSchema = - new Schema( - optional(1, "aStruct", StructType.of(optional(2, "primitive", primitiveType)))); - UpdateSchema updateApi = loadUpdateApi(new Schema()); - EvolveSchemaVisitor.visit(updateApi, new Schema(), currentSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(currentSchema.asStruct()); - } - } - - @Test - public void testAddNestedPrimitive() { - for (PrimitiveType primitiveType : primitiveTypes()) { - Schema currentSchema = new Schema(optional(1, "aStruct", StructType.of())); - Schema targetSchema = - new Schema( - optional(1, "aStruct", StructType.of(optional(2, "primitive", primitiveType)))); - UpdateSchema updateApi = loadUpdateApi(currentSchema); - EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - } - - @Test - public void testMakeNestedPrimitiveOptional() { - for (PrimitiveType primitiveType : primitiveTypes()) { - Schema currentSchema = - new Schema( - optional(1, "aStruct", StructType.of(required(2, "primitive", primitiveType)))); - Schema targetSchema = - new Schema( - optional(1, "aStruct", StructType.of(optional(2, "primitive", primitiveType)))); - 
UpdateSchema updateApi = loadUpdateApi(currentSchema); - EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - } - - @Test - public void testAddNestedPrimitives() { - Schema currentSchema = new Schema(optional(1, "aStruct", StructType.of())); - Schema targetSchema = - new Schema(optional(1, "aStruct", StructType.of(primitiveFields(1, primitiveTypes())))); - UpdateSchema updateApi = loadUpdateApi(currentSchema); - EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - - @Test - public void testAddNestedLists() { - Schema targetSchema = - new Schema( - optional( - 1, - "aList", - ListType.ofOptional( - 2, - ListType.ofOptional( - 3, - ListType.ofOptional( - 4, - ListType.ofOptional( - 5, - ListType.ofOptional( - 6, - ListType.ofOptional( - 7, - ListType.ofOptional( - 8, - ListType.ofOptional( - 9, - ListType.ofOptional( - 10, DecimalType.of(11, 20)))))))))))); - UpdateSchema updateApi = loadUpdateApi(new Schema()); - EvolveSchemaVisitor.visit(updateApi, new Schema(), targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - - @Test - public void testAddNestedStruct() { - Schema currentSchema = - new Schema(optional(1, "struct1", StructType.of(optional(2, "struct2", StructType.of())))); - Schema targetSchema = - new Schema( - optional( - 1, - "struct1", - StructType.of( - optional( - 2, - "struct2", - StructType.of( - optional( - 3, - "struct3", - StructType.of( - optional( - 4, - "struct4", - StructType.of( - optional( - 5, - "struct5", - StructType.of( - optional( - 6, - "struct6", - StructType.of( - optional( - 7, - "aString", - StringType.get())))))))))))))); - UpdateSchema updateApi = loadUpdateApi(currentSchema); - EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - - @Test - public void testAddNestedMaps() { - Schema targetSchema = - new Schema( - optional( - 1, - "struct", - MapType.ofOptional( - 2, - 3, - StringType.get(), - MapType.ofOptional( - 4, - 5, - StringType.get(), - MapType.ofOptional( - 6, - 7, - StringType.get(), - MapType.ofOptional( - 8, - 9, - StringType.get(), - MapType.ofOptional( - 10, - 11, - StringType.get(), - MapType.ofOptional( - 12, 13, StringType.get(), StringType.get())))))))); - - UpdateSchema updateApi = loadUpdateApi(new Schema()); - EvolveSchemaVisitor.visit(updateApi, new Schema(), targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - - @Test - public void testDetectInvalidTopLevelList() { - Schema currentSchema = - new Schema(optional(1, "aList", ListType.ofOptional(2, StringType.get()))); - Schema targetSchema = new Schema(optional(1, "aList", ListType.ofOptional(2, LongType.get()))); - assertThatThrownBy( - () -> - EvolveSchemaVisitor.visit( - loadUpdateApi(currentSchema), currentSchema, targetSchema)) - .hasMessage("Cannot change column type: aList.element: string -> long") - .isInstanceOf(IllegalArgumentException.class); - } - - @Test - public void testDetectInvalidTopLevelMapValue() { - - Schema currentSchema = - new Schema( - optional(1, "aMap", MapType.ofOptional(2, 3, StringType.get(), StringType.get()))); - Schema targetSchema = - new Schema(optional(1, "aMap", MapType.ofOptional(2, 3, StringType.get(), LongType.get()))); - - assertThatThrownBy( - () -> - 
EvolveSchemaVisitor.visit( - loadUpdateApi(currentSchema), currentSchema, targetSchema)) - .hasMessage("Cannot change column type: aMap.value: string -> long") - .isInstanceOf(IllegalArgumentException.class); - } - - @Test - public void testDetectInvalidTopLevelMapKey() { - Schema currentSchema = - new Schema( - optional(1, "aMap", MapType.ofOptional(2, 3, StringType.get(), StringType.get()))); - Schema targetSchema = - new Schema(optional(1, "aMap", MapType.ofOptional(2, 3, UUIDType.get(), StringType.get()))); - assertThatThrownBy( - () -> - EvolveSchemaVisitor.visit( - loadUpdateApi(currentSchema), currentSchema, targetSchema)) - .hasMessage("Cannot change column type: aMap.key: string -> uuid") - .isInstanceOf(IllegalArgumentException.class); - } - - @Test - // int 32-bit signed integers -> Can promote to long - public void testTypePromoteIntegerToLong() { - Schema currentSchema = new Schema(required(1, "aCol", IntegerType.get())); - Schema targetSchema = new Schema(required(1, "aCol", LongType.get())); - - UpdateSchema updateApi = loadUpdateApi(currentSchema); - EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); - Schema applied = updateApi.apply(); - assertThat(applied.asStruct().fields()).hasSize(1); - assertThat(applied.asStruct().fields().get(0).type()).isEqualTo(LongType.get()); - } - - @Test - // float 32-bit IEEE 754 floating point -> Can promote to double - public void testTypePromoteFloatToDouble() { - Schema currentSchema = new Schema(required(1, "aCol", FloatType.get())); - Schema targetSchema = new Schema(required(1, "aCol", DoubleType.get())); - - UpdateSchema updateApi = loadUpdateApi(currentSchema); - EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); - Schema applied = updateApi.apply(); - assertThat(applied.asStruct().fields()).hasSize(1); - assertThat(applied.asStruct().fields().get(0).type()).isEqualTo(DoubleType.get()); - } - - @Test - public void testInvalidTypePromoteDoubleToFloat() { - Schema currentSchema = new Schema(required(1, "aCol", DoubleType.get())); - Schema targetSchema = new Schema(required(1, "aCol", FloatType.get())); - assertThatThrownBy( - () -> - EvolveSchemaVisitor.visit( - loadUpdateApi(currentSchema), currentSchema, targetSchema)) - .hasMessage("Cannot change column type: aCol: double -> float") - .isInstanceOf(IllegalArgumentException.class); - } - - @Test - // decimal(P,S) Fixed-point decimal; precision P, scale S -> Scale is fixed [1], precision must be - // 38 or less - public void testTypePromoteDecimalToFixedScaleWithWiderPrecision() { - Schema currentSchema = new Schema(required(1, "aCol", DecimalType.of(20, 1))); - Schema targetSchema = new Schema(required(1, "aCol", DecimalType.of(22, 1))); - - UpdateSchema updateApi = loadUpdateApi(currentSchema); - EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - - @Test - public void testAddPrimitiveToNestedStruct() { - Schema existingSchema = - new Schema( - required( - 1, - "struct1", - StructType.of( - optional( - 2, - "struct2", - StructType.of( - optional( - 3, - "list", - ListType.ofOptional( - 4, - StructType.of(optional(5, "number", IntegerType.get()))))))))); - - Schema targetSchema = - new Schema( - required( - 1, - "struct1", - StructType.of( - optional( - 2, - "struct2", - StructType.of( - optional( - 3, - "list", - ListType.ofOptional( - 4, - StructType.of( - optional(5, "number", LongType.get()), - optional(6, "time", TimeType.get()))))))))); - - 
UpdateSchema updateApi = loadUpdateApi(existingSchema); - EvolveSchemaVisitor.visit(updateApi, existingSchema, targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - - @Test - public void testReplaceListWithPrimitive() { - Schema currentSchema = - new Schema(optional(1, "aColumn", ListType.ofOptional(2, StringType.get()))); - Schema targetSchema = new Schema(optional(1, "aColumn", StringType.get())); - assertThatThrownBy( - () -> - EvolveSchemaVisitor.visit( - loadUpdateApi(currentSchema), currentSchema, targetSchema)) - .hasMessage("Cannot change column type: aColumn: list -> string") - .isInstanceOf(IllegalArgumentException.class); - } - - @Test - public void addNewTopLevelStruct() { - Schema currentSchema = - new Schema( - optional( - 1, - "map1", - MapType.ofOptional( - 2, - 3, - StringType.get(), - ListType.ofOptional( - 4, StructType.of(optional(5, "string1", StringType.get())))))); - - Schema targetSchema = - new Schema( - optional( - 1, - "map1", - MapType.ofOptional( - 2, - 3, - StringType.get(), - ListType.ofOptional( - 4, StructType.of(optional(5, "string1", StringType.get()))))), - optional( - 6, - "struct1", - StructType.of( - optional(7, "d1", StructType.of(optional(8, "d2", StringType.get())))))); - - UpdateSchema updateApi = loadUpdateApi(currentSchema); - EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - - @Test - public void testAppendNestedStruct() { - Schema currentSchema = - new Schema( - required( - 1, - "s1", - StructType.of( - optional( - 2, - "s2", - StructType.of( - optional( - 3, "s3", StructType.of(optional(4, "s4", StringType.get())))))))); - - Schema targetSchema = - new Schema( - required( - 1, - "s1", - StructType.of( - optional( - 2, - "s2", - StructType.of( - optional(3, "s3", StructType.of(optional(4, "s4", StringType.get()))), - optional( - 5, - "repeat", - StructType.of( - optional( - 6, - "s1", - StructType.of( - optional( - 7, - "s2", - StructType.of( - optional( - 8, - "s3", - StructType.of( - optional( - 9, - "s4", - StringType.get())))))))))))))); - - UpdateSchema updateApi = loadUpdateApi(currentSchema); - EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); - assertThat(updateApi.apply().asStruct()).isEqualTo(targetSchema.asStruct()); - } - - @Test - public void testMakeNestedStructOptional() { - Schema currentSchema = getNestedSchemaWithOptionalModifier(false); - Schema targetSchema = - new Schema( - required( - 1, - "s1", - StructType.of( - optional( - 2, - "s2", - StructType.of( - optional( - 3, "s3", StructType.of(optional(4, "s4", StringType.get())))))))); - UpdateSchema updateApi = loadUpdateApi(currentSchema); - EvolveSchemaVisitor.visit(updateApi, currentSchema, targetSchema); - assertThat(getNestedSchemaWithOptionalModifier(true).asStruct()) - .isEqualTo(updateApi.apply().asStruct()); - } - - private static Schema getNestedSchemaWithOptionalModifier(boolean nestedIsOptional) { - return new Schema( - required( - 1, - "s1", - StructType.of( - optional( - 2, - "s2", - StructType.of( - optional(3, "s3", StructType.of(optional(4, "s4", StringType.get()))), - of( - 5, - nestedIsOptional, - "repeat", - StructType.of( - optional( - 6, - "s1", - StructType.of( - optional( - 7, - "s2", - StructType.of( - optional( - 8, - "s3", - StructType.of( - optional( - 9, "s4", StringType.get())))))))))))))); - } - - private static UpdateSchema loadUpdateApi(Schema schema) { - try { - Constructor 
constructor = - TestEvolveSchemaVisitor.class - .getClassLoader() - .loadClass("org.apache.iceberg.SchemaUpdate") - .getDeclaredConstructor(Schema.class, int.class); - constructor.setAccessible(true); - return (UpdateSchema) constructor.newInstance(schema, schema.highestFieldId()); - } catch (Exception e) { - throw new RuntimeException("Failed to instantiate SchemaUpdate class", e); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java deleted file mode 100644 index 8d559e920620..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Collections; -import java.util.Map; -import java.util.Set; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.runtime.state.KeyGroupRangeAssignment; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -class TestHashKeyGenerator { - - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - - private static final String BRANCH = "main"; - private static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of("default", "table"); - - @Test - void testRoundRobinWithDistributionModeNone() throws Exception { - int writeParallelism = 10; - int maxWriteParallelism = 2; - HashKeyGenerator generator = new HashKeyGenerator(1, maxWriteParallelism); - PartitionSpec spec = PartitionSpec.unpartitioned(); - - GenericRowData row = GenericRowData.of(1, StringData.fromString("z")); - int writeKey1 = - getWriteKey( - generator, spec, DistributionMode.NONE, writeParallelism, Collections.emptySet(), row); - int writeKey2 = - getWriteKey( - generator, spec, DistributionMode.NONE, writeParallelism, Collections.emptySet(), row); - int writeKey3 = - getWriteKey( - generator, spec, DistributionMode.NONE, writeParallelism, Collections.emptySet(), row); - int writeKey4 = - getWriteKey( - generator, spec, DistributionMode.NONE, 
writeParallelism, Collections.emptySet(), row); - - assertThat(writeKey1).isNotEqualTo(writeKey2); - assertThat(writeKey3).isEqualTo(writeKey1); - assertThat(writeKey4).isEqualTo(writeKey2); - - assertThat(getSubTaskId(writeKey1, writeParallelism, maxWriteParallelism)).isEqualTo(0); - assertThat(getSubTaskId(writeKey2, writeParallelism, maxWriteParallelism)).isEqualTo(5); - assertThat(getSubTaskId(writeKey3, writeParallelism, maxWriteParallelism)).isEqualTo(0); - assertThat(getSubTaskId(writeKey4, writeParallelism, maxWriteParallelism)).isEqualTo(5); - } - - @Test - void testBucketingWithDistributionModeHash() throws Exception { - int writeParallelism = 3; - int maxWriteParallelism = 8; - HashKeyGenerator generator = new HashKeyGenerator(1, maxWriteParallelism); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); - - GenericRowData row1 = GenericRowData.of(1, StringData.fromString("a")); - GenericRowData row2 = GenericRowData.of(1, StringData.fromString("b")); - GenericRowData row3 = GenericRowData.of(2, StringData.fromString("c")); - GenericRowData row4 = GenericRowData.of(2, StringData.fromString("d")); - - int writeKey1 = - getWriteKey( - generator, spec, DistributionMode.HASH, writeParallelism, Collections.emptySet(), row1); - int writeKey2 = - getWriteKey( - generator, spec, DistributionMode.HASH, writeParallelism, Collections.emptySet(), row2); - int writeKey3 = - getWriteKey( - generator, spec, DistributionMode.HASH, writeParallelism, Collections.emptySet(), row3); - int writeKey4 = - getWriteKey( - generator, spec, DistributionMode.HASH, writeParallelism, Collections.emptySet(), row4); - - assertThat(writeKey1).isEqualTo(writeKey2); - assertThat(writeKey3).isNotEqualTo(writeKey1); - assertThat(writeKey4).isEqualTo(writeKey3); - - assertThat(getSubTaskId(writeKey1, writeParallelism, maxWriteParallelism)).isEqualTo(0); - assertThat(getSubTaskId(writeKey2, writeParallelism, maxWriteParallelism)).isEqualTo(0); - assertThat(getSubTaskId(writeKey3, writeParallelism, maxWriteParallelism)).isEqualTo(1); - assertThat(getSubTaskId(writeKey4, writeParallelism, maxWriteParallelism)).isEqualTo(1); - } - - @Test - void testEqualityKeys() throws Exception { - int writeParallelism = 2; - int maxWriteParallelism = 8; - HashKeyGenerator generator = new HashKeyGenerator(16, maxWriteParallelism); - PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); - - GenericRowData row1 = GenericRowData.of(1, StringData.fromString("foo")); - GenericRowData row2 = GenericRowData.of(1, StringData.fromString("bar")); - GenericRowData row3 = GenericRowData.of(2, StringData.fromString("baz")); - Set equalityColumns = Collections.singleton("id"); - - int writeKey1 = - getWriteKey( - generator, - unpartitioned, - DistributionMode.NONE, - writeParallelism, - equalityColumns, - row1); - int writeKey2 = - getWriteKey( - generator, - unpartitioned, - DistributionMode.NONE, - writeParallelism, - equalityColumns, - row2); - int writeKey3 = - getWriteKey( - generator, - unpartitioned, - DistributionMode.NONE, - writeParallelism, - equalityColumns, - row3); - - assertThat(writeKey1).isEqualTo(writeKey2); - assertThat(writeKey2).isNotEqualTo(writeKey3); - - assertThat(getSubTaskId(writeKey1, writeParallelism, maxWriteParallelism)).isEqualTo(1); - assertThat(getSubTaskId(writeKey2, writeParallelism, maxWriteParallelism)).isEqualTo(1); - assertThat(getSubTaskId(writeKey3, writeParallelism, maxWriteParallelism)).isEqualTo(0); - } - - @Test - void testCapAtMaxWriteParallelism() throws Exception { - 
int writeParallelism = 10; - int maxWriteParallelism = 5; - HashKeyGenerator generator = new HashKeyGenerator(16, maxWriteParallelism); - PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); - - Set writeKeys = Sets.newHashSet(); - for (int i = 0; i < 20; i++) { - GenericRowData row = GenericRowData.of(i, StringData.fromString("z")); - writeKeys.add( - getWriteKey( - generator, - unpartitioned, - DistributionMode.NONE, - writeParallelism, - Collections.emptySet(), - row)); - } - - assertThat(writeKeys).hasSize(maxWriteParallelism); - assertThat( - writeKeys.stream() - .map(key -> getSubTaskId(key, writeParallelism, writeParallelism)) - .distinct() - .count()) - .isEqualTo(maxWriteParallelism); - } - - @Test - void testHashModeWithoutEqualityFieldsFallsBackToNone() throws Exception { - int writeParallelism = 2; - int maxWriteParallelism = 8; - HashKeyGenerator generator = new HashKeyGenerator(16, maxWriteParallelism); - Schema noIdSchema = new Schema(Types.NestedField.required(1, "x", Types.StringType.get())); - PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); - - DynamicRecord record = - new DynamicRecord( - TABLE_IDENTIFIER, - BRANCH, - noIdSchema, - GenericRowData.of(StringData.fromString("v")), - unpartitioned, - DistributionMode.HASH, - writeParallelism); - - int writeKey1 = generator.generateKey(record); - int writeKey2 = generator.generateKey(record); - int writeKey3 = generator.generateKey(record); - assertThat(writeKey1).isNotEqualTo(writeKey2); - assertThat(writeKey3).isEqualTo(writeKey1); - - assertThat(getSubTaskId(writeKey1, writeParallelism, maxWriteParallelism)).isEqualTo(1); - assertThat(getSubTaskId(writeKey2, writeParallelism, maxWriteParallelism)).isEqualTo(0); - assertThat(getSubTaskId(writeKey3, writeParallelism, maxWriteParallelism)).isEqualTo(1); - } - - @Test - void testSchemaSpecOverrides() throws Exception { - int maxCacheSize = 10; - int writeParallelism = 5; - int maxWriteParallelism = 10; - HashKeyGenerator generator = new HashKeyGenerator(maxCacheSize, maxWriteParallelism); - - DynamicRecord record = - new DynamicRecord( - TABLE_IDENTIFIER, - BRANCH, - SCHEMA, - GenericRowData.of(1, StringData.fromString("foo")), - PartitionSpec.unpartitioned(), - DistributionMode.NONE, - writeParallelism); - - int writeKey1 = generator.generateKey(record); - int writeKey2 = generator.generateKey(record); - // Assert that we are bucketing via NONE (round-robin) - assertThat(writeKey1).isNotEqualTo(writeKey2); - - // Schema has different id - Schema overrideSchema = new Schema(42, SCHEMA.columns()); - // Spec has different id - PartitionSpec overrideSpec = PartitionSpec.builderFor(SCHEMA).withSpecId(42).build(); - RowData overrideData = GenericRowData.of(1L, StringData.fromString("foo")); - - // We get a new key selector for the schema which starts off on the same offset - assertThat(generator.generateKey(record, overrideSchema, null, null)).isEqualTo(writeKey1); - // We get a new key selector for the spec which starts off on the same offset - assertThat(generator.generateKey(record, null, overrideSpec, null)).isEqualTo(writeKey1); - // We get the same key selector which yields a different result for the overridden data - assertThat(generator.generateKey(record, null, null, overrideData)).isNotEqualTo(writeKey1); - } - - @Test - void testMultipleTables() throws Exception { - int maxCacheSize = 10; - int writeParallelism = 2; - int maxWriteParallelism = 8; - HashKeyGenerator generator = new HashKeyGenerator(maxCacheSize, maxWriteParallelism); - - PartitionSpec 
unpartitioned = PartitionSpec.unpartitioned(); - - GenericRowData rowData = GenericRowData.of(1, StringData.fromString("foo")); - - DynamicRecord record1 = - new DynamicRecord( - TableIdentifier.of("a", "table"), - BRANCH, - SCHEMA, - rowData, - unpartitioned, - DistributionMode.HASH, - writeParallelism); - record1.setEqualityFields(Collections.singleton("id")); - DynamicRecord record2 = - new DynamicRecord( - TableIdentifier.of("my", "other", "table"), - BRANCH, - SCHEMA, - rowData, - unpartitioned, - DistributionMode.HASH, - writeParallelism); - record2.setEqualityFields(Collections.singleton("id")); - - // Consistent hashing for the same record due to HASH distribution mode - int writeKeyRecord1 = generator.generateKey(record1); - assertThat(writeKeyRecord1).isEqualTo(generator.generateKey(record1)); - int writeKeyRecord2 = generator.generateKey(record2); - assertThat(writeKeyRecord2).isEqualTo(generator.generateKey(record2)); - - // But the write keys are for different tables and should not be equal - assertThat(writeKeyRecord1).isNotEqualTo(writeKeyRecord2); - - assertThat(getSubTaskId(writeKeyRecord1, writeParallelism, maxWriteParallelism)).isEqualTo(1); - assertThat(getSubTaskId(writeKeyRecord2, writeParallelism, maxWriteParallelism)).isEqualTo(0); - } - - @Test - void testCaching() throws Exception { - int maxCacheSize = 1; - int writeParallelism = 2; - int maxWriteParallelism = 8; - HashKeyGenerator generator = new HashKeyGenerator(maxCacheSize, maxWriteParallelism); - Map> keySelectorCache = - generator.getKeySelectorCache(); - - PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); - DynamicRecord record = - new DynamicRecord( - TABLE_IDENTIFIER, - BRANCH, - SCHEMA, - GenericRowData.of(1, StringData.fromString("foo")), - unpartitioned, - DistributionMode.NONE, - writeParallelism); - - int writeKey1 = generator.generateKey(record); - assertThat(keySelectorCache).hasSize(1); - - int writeKey2 = generator.generateKey(record); - assertThat(writeKey2).isNotEqualTo(writeKey1); - assertThat(keySelectorCache).hasSize(1); - - int writeKey3 = generator.generateKey(record); - assertThat(keySelectorCache).hasSize(1); - // We create a new key selector which will start off at the same position - assertThat(writeKey1).isEqualTo(writeKey3); - } - - private static int getWriteKey( - HashKeyGenerator generator, - PartitionSpec spec, - DistributionMode mode, - int writeParallelism, - Set equalityFields, - GenericRowData row) - throws Exception { - DynamicRecord record = - new DynamicRecord(TABLE_IDENTIFIER, BRANCH, SCHEMA, row, spec, mode, writeParallelism); - record.setEqualityFields(equalityFields); - return generator.generateKey(record); - } - - private static int getSubTaskId(int writeKey1, int writeParallelism, int maxWriteParallelism) { - return KeyGroupRangeAssignment.assignKeyToParallelOperator( - writeKey1, maxWriteParallelism, writeParallelism); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java deleted file mode 100644 index 679d3de978a3..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestLRUCache.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.Map; -import java.util.function.Consumer; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.Test; - -class TestLRUCache { - private static final Consumer> NO_OP_CALLBACK = ignored -> {}; - - @Test - void testPut() { - LRUCache cache = new LRUCache<>(1, NO_OP_CALLBACK); - cache.put(1, 1); - - assertThat(cache).hasSize(1).containsEntry(1, 1); - } - - @Test - void testGet() { - LRUCache cache = new LRUCache<>(1, NO_OP_CALLBACK); - cache.put(1, 123); - - assertThat(cache).hasSize(1); - assertThat(cache.get(1)).isEqualTo(123); - } - - @Test - void testElementEviction() { - int maxSize = 2; - LRUCache cache = new LRUCache<>(maxSize, NO_OP_CALLBACK); - - cache.put(1, 1); - cache.put(2, 2); - Integer value = cache.get(1); - assertThat(value).isEqualTo(1); - - cache.put(3, 3); // "2" should be evicted - - assertThat(cache).hasSize(2).containsEntry(1, 1).containsEntry(3, 3); - } - - @Test - void testEvictionCallback() { - int maxSize = 2; - TestEvictionCallback callback = new TestEvictionCallback(); - LRUCache cache = new LRUCache<>(maxSize, callback); - - cache.put(1, 1); - cache.put(2, 2); - Integer value = cache.get(1); - assertThat(value).isEqualTo(1); - - cache.put(3, 3); // "2" should be evicted - - assertThat(callback.evictedEntries).containsExactly(Map.entry(2, 2)); - } - - private static class TestEvictionCallback implements Consumer> { - private final List> evictedEntries = Lists.newArrayList(); - - @Override - public void accept(Map.Entry entry) { - evictedEntries.add(entry); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java deleted file mode 100644 index 3e7025de6f91..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestPartitionSpecEvolution.java +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestPartitionSpecEvolution { - - @Test - void testCompatible() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required(1, "data", Types.StringType.get())); - - PartitionSpec spec1 = PartitionSpec.builderFor(schema).bucket("id", 10).build(); - PartitionSpec spec2 = PartitionSpec.builderFor(schema).bucket("id", 10).build(); - - // Happy case, source ids and names match - assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isTrue(); - } - - @Test - void testNotCompatibleDifferentTransform() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required(1, "data", Types.StringType.get())); - - PartitionSpec spec1 = PartitionSpec.builderFor(schema).bucket("id", 10).build(); - // Same spec als spec1 but different number of buckets - PartitionSpec spec2 = PartitionSpec.builderFor(schema).bucket("id", 23).build(); - - assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); - } - - @Test - void testNotCompatibleMoreFields() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required(1, "data", Types.StringType.get())); - - PartitionSpec spec1 = PartitionSpec.builderFor(schema).bucket("id", 10).build(); - // Additional field - PartitionSpec spec2 = - PartitionSpec.builderFor(schema).bucket("id", 10).truncate("data", 1).build(); - - assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); - } - - @Test - void testCompatibleWithNonMatchingSourceIds() { - Schema schema1 = - new Schema( - // Use zero-based field ids - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required(1, "data", Types.StringType.get())); - - PartitionSpec spec1 = PartitionSpec.builderFor(schema1).bucket("id", 10).build(); - - Schema schema2 = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - - // Same spec als spec1 but bound to a different schema - PartitionSpec spec2 = PartitionSpec.builderFor(schema2).bucket("id", 10).build(); - - // Compatible because the source names match - assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isTrue(); - } - - @Test - void testPartitionSpecEvolution() { - Schema schema1 = - new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required(1, "data", Types.StringType.get())); - - PartitionSpec spec1 = PartitionSpec.builderFor(schema1).bucket("id", 10).build(); - - Schema schema2 = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - - // Change num buckets - PartitionSpec spec2 = PartitionSpec.builderFor(schema2).bucket("id", 23).build(); - - assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); - PartitionSpecEvolution.PartitionSpecChanges result = - PartitionSpecEvolution.evolve(spec1, spec2); - - assertThat(result.termsToAdd().toString()).isEqualTo("[bucket[23](ref(name=\"id\"))]"); - 
assertThat(result.termsToRemove().toString()).isEqualTo("[bucket[10](ref(name=\"id\"))]"); - } - - @Test - void testPartitionSpecEvolutionAddField() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required(1, "data", Types.StringType.get())); - - PartitionSpec spec1 = PartitionSpec.builderFor(schema).build(); - // Add field - PartitionSpec spec2 = PartitionSpec.builderFor(schema).bucket("id", 23).build(); - - assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); - PartitionSpecEvolution.PartitionSpecChanges result = - PartitionSpecEvolution.evolve(spec1, spec2); - - assertThat(result.termsToAdd().toString()).isEqualTo("[bucket[23](ref(name=\"id\"))]"); - assertThat(result.termsToRemove().toString()).isEqualTo("[]"); - } - - @Test - void testPartitionSpecEvolutionRemoveField() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required(1, "data", Types.StringType.get())); - - PartitionSpec spec1 = PartitionSpec.builderFor(schema).bucket("id", 23).build(); - // Remove field - PartitionSpec spec2 = PartitionSpec.builderFor(schema).build(); - - assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); - PartitionSpecEvolution.PartitionSpecChanges result = - PartitionSpecEvolution.evolve(spec1, spec2); - - assertThat(result.termsToAdd().toString()).isEqualTo("[]"); - assertThat(result.termsToRemove().toString()).isEqualTo("[bucket[23](ref(name=\"id\"))]"); - } - - @Test - void testPartitionSpecEvolutionWithNestedFields() { - Schema schema1 = - new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get()), - Types.NestedField.required( - 1, - "data", - Types.StructType.of(Types.NestedField.required(2, "str", Types.StringType.get())))); - - PartitionSpec spec1 = PartitionSpec.builderFor(schema1).bucket("data.str", 10).build(); - - Schema schema2 = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required( - 2, - "data", - Types.StructType.of(Types.NestedField.required(3, "str", Types.StringType.get())))); - - // Change num buckets - PartitionSpec spec2 = PartitionSpec.builderFor(schema2).bucket("data.str", 23).build(); - - assertThat(PartitionSpecEvolution.checkCompatibility(spec1, spec2)).isFalse(); - PartitionSpecEvolution.PartitionSpecChanges result = - PartitionSpecEvolution.evolve(spec1, spec2); - - assertThat(result.termsToAdd().toString()).isEqualTo("[bucket[23](ref(name=\"data.str\"))]"); - assertThat(result.termsToRemove().toString()).isEqualTo("[bucket[10](ref(name=\"data.str\"))]"); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java deleted file mode 100644 index c4a86bb79e4a..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestRowDataConverter.java +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.math.BigDecimal; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.DataGenerator; -import org.apache.iceberg.flink.DataGenerators; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.Days; -import org.junit.jupiter.api.Test; - -class TestRowDataConverter { - - static final Schema SCHEMA = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); - - static final Schema SCHEMA2 = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "onemore", Types.DoubleType.get())); - - @Test - void testPrimitiveTypes() { - DataGenerator generator = new DataGenerators.Primitives(); - assertThat( - convert( - generator.generateFlinkRowData(), - generator.icebergSchema(), - generator.icebergSchema())) - .isEqualTo(generator.generateFlinkRowData()); - } - - @Test - void testAddColumn() { - assertThat(convert(SimpleDataUtil.createRowData(1, "a"), SCHEMA, SCHEMA2)) - .isEqualTo(GenericRowData.of(1, StringData.fromString("a"), null)); - } - - @Test - void testAddRequiredColumn() { - Schema currentSchema = new Schema(Types.NestedField.optional(1, "id", Types.IntegerType.get())); - Schema targetSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - required(2, "data", Types.StringType.get())); - - assertThatThrownBy(() -> convert(GenericRowData.of(42), currentSchema, targetSchema)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("is non-nullable but does not exist in source schema"); - } - - @Test - void testIntToLong() { - Schema schemaWithLong = - new Schema( - Types.NestedField.optional(2, "id", Types.LongType.get()), - Types.NestedField.optional(4, "data", Types.StringType.get())); - - assertThat(convert(SimpleDataUtil.createRowData(1, "a"), SimpleDataUtil.SCHEMA, schemaWithLong)) - .isEqualTo(GenericRowData.of(1L, StringData.fromString("a"))); - } - - @Test - void testFloatToDouble() { - Schema schemaWithFloat = - new Schema(Types.NestedField.optional(1, "float2double", Types.FloatType.get())); 
- Schema schemaWithDouble = - new Schema(Types.NestedField.optional(2, "float2double", Types.DoubleType.get())); - - assertThat(convert(GenericRowData.of(1.5f), schemaWithFloat, schemaWithDouble)) - .isEqualTo(GenericRowData.of(1.5d)); - } - - @Test - void testDateToTimestamp() { - Schema schemaWithFloat = - new Schema(Types.NestedField.optional(1, "date2timestamp", Types.DateType.get())); - Schema schemaWithDouble = - new Schema( - Types.NestedField.optional(2, "date2timestamp", Types.TimestampType.withoutZone())); - - DateTime time = new DateTime(2022, 1, 10, 0, 0, 0, 0, DateTimeZone.UTC); - int days = - Days.daysBetween(new DateTime(1970, 1, 1, 0, 0, 0, 0, DateTimeZone.UTC), time).getDays(); - - assertThat(convert(GenericRowData.of(days), schemaWithFloat, schemaWithDouble)) - .isEqualTo(GenericRowData.of(TimestampData.fromEpochMillis(time.getMillis()))); - } - - @Test - void testIncreasePrecision() { - Schema before = - new Schema(Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2))); - Schema after = - new Schema(Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(10, 2))); - - assertThat( - convert( - GenericRowData.of(DecimalData.fromBigDecimal(new BigDecimal("-1.50"), 9, 2)), - before, - after)) - .isEqualTo(GenericRowData.of(DecimalData.fromBigDecimal(new BigDecimal("-1.50"), 10, 2))); - } - - @Test - void testStructAddOptionalFields() { - DataGenerator generator = new DataGenerators.StructOfPrimitive(); - RowData oldData = generator.generateFlinkRowData(); - Schema oldSchema = generator.icebergSchema(); - Types.NestedField structField = oldSchema.columns().get(1); - Schema newSchema = - new Schema( - oldSchema.columns().get(0), - Types.NestedField.required( - 10, - structField.name(), - Types.StructType.of( - required(101, "id", Types.IntegerType.get()), - optional(103, "optional", Types.StringType.get()), - required(102, "name", Types.StringType.get())))); - RowData newData = - GenericRowData.of( - StringData.fromString("row_id_value"), - GenericRowData.of(1, null, StringData.fromString("Jane"))); - - assertThat(convert(oldData, oldSchema, newSchema)).isEqualTo(newData); - } - - @Test - void testStructAddRequiredFieldsWithOptionalRoot() { - DataGenerator generator = new DataGenerators.StructOfPrimitive(); - RowData oldData = generator.generateFlinkRowData(); - Schema oldSchema = generator.icebergSchema(); - Types.NestedField structField = oldSchema.columns().get(1); - Schema newSchema = - new Schema( - oldSchema.columns().get(0), - Types.NestedField.optional( - 10, - "newFieldOptionalField", - Types.StructType.of( - Types.NestedField.optional( - structField.fieldId(), - structField.name(), - Types.StructType.of( - optional(101, "id", Types.IntegerType.get()), - // Required columns which leads to nulling the entire struct - required(103, "required", Types.StringType.get()), - required(102, "name", Types.StringType.get())))))); - - RowData expectedData = GenericRowData.of(StringData.fromString("row_id_value"), null); - - assertThat(convert(oldData, oldSchema, newSchema)).isEqualTo(expectedData); - } - - @Test - void testStructAddRequiredFields() { - DataGenerator generator = new DataGenerators.StructOfPrimitive(); - RowData oldData = generator.generateFlinkRowData(); - Schema oldSchema = generator.icebergSchema(); - Types.NestedField structField = oldSchema.columns().get(1); - Schema newSchema = - new Schema( - oldSchema.columns().get(0), - Types.NestedField.required( - 10, - structField.name(), - Types.StructType.of( - required(101, "id", 
Types.IntegerType.get()), - required(103, "required", Types.StringType.get()), - required(102, "name", Types.StringType.get())))); - - assertThatThrownBy(() -> convert(oldData, oldSchema, newSchema)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("is non-nullable but does not exist in source schema"); - } - - @Test - void testMap() { - DataGenerator generator = new DataGenerators.MapOfPrimitives(); - RowData oldData = generator.generateFlinkRowData(); - Schema oldSchema = generator.icebergSchema(); - Types.NestedField mapField = oldSchema.columns().get(1); - Schema newSchema = - new Schema( - oldSchema.columns().get(0), - Types.NestedField.optional( - 10, - mapField.name(), - Types.MapType.ofRequired(101, 102, Types.StringType.get(), Types.LongType.get()))); - RowData newData = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of( - StringData.fromString("Jane"), 1L, StringData.fromString("Joe"), 2L))); - - assertThat(convert(oldData, oldSchema, newSchema)).isEqualTo(newData); - } - - @Test - void testArray() { - DataGenerator generator = new DataGenerators.ArrayOfPrimitive(); - RowData oldData = generator.generateFlinkRowData(); - Schema oldSchema = generator.icebergSchema(); - Types.NestedField arrayField = oldSchema.columns().get(1); - Schema newSchema = - new Schema( - oldSchema.columns().get(0), - Types.NestedField.optional( - 10, arrayField.name(), Types.ListType.ofOptional(101, Types.LongType.get()))); - RowData newData = - GenericRowData.of( - StringData.fromString("row_id_value"), new GenericArrayData(new Long[] {1L, 2L, 3L})); - - assertThat(convert(oldData, oldSchema, newSchema)).isEqualTo(newData); - } - - private static RowData convert(RowData sourceData, Schema sourceSchema, Schema targetSchema) { - return (RowData) - DataConverter.get( - FlinkSchemaUtil.convert(sourceSchema), FlinkSchemaUtil.convert(targetSchema)) - .convert(sourceData); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java deleted file mode 100644 index 2264cc3a8db0..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableMetadataCache.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.commons.lang3.SerializationUtils; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.sink.TestFlinkIcebergSinkBase; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestTableMetadataCache extends TestFlinkIcebergSinkBase { - - static final Schema SCHEMA = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); - - static final Schema SCHEMA2 = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "extra", Types.StringType.get())); - - @Test - void testCaching() { - Catalog catalog = CATALOG_EXTENSION.catalog(); - TableIdentifier tableIdentifier = TableIdentifier.parse("default.myTable"); - catalog.createTable(tableIdentifier, SCHEMA); - TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); - - Schema schema1 = cache.schema(tableIdentifier, SCHEMA).resolvedTableSchema(); - assertThat(schema1.sameSchema(SCHEMA)).isTrue(); - assertThat( - cache.schema(tableIdentifier, SerializationUtils.clone(SCHEMA)).resolvedTableSchema()) - .isEqualTo(schema1); - - assertThat(cache.schema(tableIdentifier, SCHEMA2)).isEqualTo(TableMetadataCache.NOT_FOUND); - - schema1 = cache.schema(tableIdentifier, SCHEMA).resolvedTableSchema(); - assertThat( - cache.schema(tableIdentifier, SerializationUtils.clone(SCHEMA)).resolvedTableSchema()) - .isEqualTo(schema1); - } - - @Test - void testCacheInvalidationAfterSchemaChange() { - Catalog catalog = CATALOG_EXTENSION.catalog(); - TableIdentifier tableIdentifier = TableIdentifier.parse("default.myTable"); - catalog.createTable(tableIdentifier, SCHEMA); - TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); - TableUpdater tableUpdater = new TableUpdater(cache, catalog); - - Schema schema1 = cache.schema(tableIdentifier, SCHEMA).resolvedTableSchema(); - assertThat(schema1.sameSchema(SCHEMA)).isTrue(); - - catalog.dropTable(tableIdentifier); - catalog.createTable(tableIdentifier, SCHEMA2); - tableUpdater.update(tableIdentifier, "main", SCHEMA2, PartitionSpec.unpartitioned()); - - Schema schema2 = cache.schema(tableIdentifier, SCHEMA2).resolvedTableSchema(); - assertThat(schema2.sameSchema(SCHEMA2)).isTrue(); - } - - @Test - void testCachingDisabled() { - Catalog catalog = CATALOG_EXTENSION.catalog(); - TableIdentifier tableIdentifier = TableIdentifier.parse("default.myTable"); - catalog.createTable(tableIdentifier, SCHEMA); - TableMetadataCache cache = new TableMetadataCache(catalog, 0, Long.MAX_VALUE, 10); - - assertThat(cache.getInternalCache()).isEmpty(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java deleted file mode 100644 index ec610a3357ba..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableSerializerCache.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.apache.iceberg.types.Types.DoubleType; -import static org.apache.iceberg.types.Types.LongType; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.apache.iceberg.types.Types.StringType; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.function.Supplier; -import org.apache.flink.api.java.tuple.Tuple3; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; - -public class TestTableSerializerCache { - - @RegisterExtension - static final HadoopCatalogExtension CATALOG_EXTENSION = new HadoopCatalogExtension("db", "table"); - - Schema schema1 = new Schema(23, required(1, "id", LongType.get())); - - Schema schema2 = - new Schema( - 42, - required(1, "id", LongType.get()), - optional(2, "data", StringType.get()), - optional(3, "double", DoubleType.get())); - - TableSerializerCache cache = new TableSerializerCache(CATALOG_EXTENSION.catalogLoader(), 10); - - @Test - void testFullSchemaCaching() { - Supplier creator1a = - () -> cache.serializer("table", schema1, PartitionSpec.unpartitioned()); - Supplier creator1b = - () -> cache.serializer("table", schema2, PartitionSpec.unpartitioned()); - Supplier creator2 = - () -> cache.serializer("table2", schema2, PartitionSpec.unpartitioned()); - - RowDataSerializer serializer1a = creator1a.get(); - RowDataSerializer serializer1b = creator1b.get(); - RowDataSerializer serializer2 = creator2.get(); - assertThat(serializer1a).isNotSameAs(serializer1b).isNotSameAs(serializer2); - - assertThat(serializer1a).isSameAs(creator1a.get()); - assertThat(serializer1b).isSameAs(creator1b.get()); - assertThat(serializer2).isSameAs(creator2.get()); - } - - @Test - void testCachingWithSchemaLookup() { - CatalogLoader catalogLoader = CATALOG_EXTENSION.catalogLoader(); - cache = new TableSerializerCache(catalogLoader, 10); - - Catalog catalog = catalogLoader.loadCatalog(); - Table table = catalog.createTable(TableIdentifier.of("table"), schema1); - - Tuple3 serializerWithSchemaAndSpec = - cache.serializerWithSchemaAndSpec( - "table", table.schema().schemaId(), PartitionSpec.unpartitioned().specId()); - assertThat(serializerWithSchemaAndSpec).isNotNull(); - assertThat(serializerWithSchemaAndSpec.f0).isNotNull(); - 
assertThat(serializerWithSchemaAndSpec.f1.sameSchema(table.schema())).isTrue(); - assertThat(serializerWithSchemaAndSpec.f2).isEqualTo(table.spec()); - - Tuple3 serializerWithSchemaAndSpec2 = - cache.serializerWithSchemaAndSpec( - "table", table.schema().schemaId(), PartitionSpec.unpartitioned().specId()); - - assertThat(serializerWithSchemaAndSpec.f0).isSameAs(serializerWithSchemaAndSpec2.f0); - assertThat(serializerWithSchemaAndSpec.f1).isSameAs(serializerWithSchemaAndSpec2.f1); - assertThat(serializerWithSchemaAndSpec.f2).isSameAs(serializerWithSchemaAndSpec2.f2); - } - - @Test - void testCacheEviction() { - cache = new TableSerializerCache(CATALOG_EXTENSION.catalogLoader(), 0); - assertThat(cache.maximumSize()).isEqualTo(0); - - Supplier creator1 = - () -> cache.serializer("table", schema1, PartitionSpec.unpartitioned()); - Supplier creator2 = - () -> cache.serializer("table2", schema2, PartitionSpec.unpartitioned()); - - RowDataSerializer serializer1 = creator1.get(); - RowDataSerializer serializer2 = creator2.get(); - - cache.getCache().clear(); - assertThat(serializer1).isNotSameAs(creator1.get()); - assertThat(serializer2).isNotSameAs(creator2.get()); - } - - @Test - void testCacheSize() { - cache = new TableSerializerCache(CATALOG_EXTENSION.catalogLoader(), 1000); - assertThat(cache.maximumSize()).isEqualTo(1000); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java deleted file mode 100644 index ad35d929728d..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestTableUpdater.java +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.dynamic; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Map; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.sink.TestFlinkIcebergSinkBase; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestTableUpdater extends TestFlinkIcebergSinkBase { - - static final Schema SCHEMA = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); - - static final Schema SCHEMA2 = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "extra", Types.StringType.get())); - - @Test - void testTableCreation() { - Catalog catalog = CATALOG_EXTENSION.catalog(); - TableIdentifier tableIdentifier = TableIdentifier.parse("myTable"); - TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); - TableUpdater tableUpdater = new TableUpdater(cache, catalog); - - tableUpdater.update(tableIdentifier, "main", SCHEMA, PartitionSpec.unpartitioned()); - assertThat(catalog.tableExists(tableIdentifier)).isTrue(); - - TableMetadataCache.ResolvedSchemaInfo cachedSchema = cache.schema(tableIdentifier, SCHEMA); - assertThat(cachedSchema.resolvedTableSchema().sameSchema(SCHEMA)).isTrue(); - } - - @Test - void testTableAlreadyExists() { - Catalog catalog = CATALOG_EXTENSION.catalog(); - TableIdentifier tableIdentifier = TableIdentifier.parse("myTable"); - TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); - TableUpdater tableUpdater = new TableUpdater(cache, catalog); - - // Make the table non-existent in cache - cache.exists(tableIdentifier); - // Create the table - catalog.createTable(tableIdentifier, SCHEMA); - // Make sure that the cache is invalidated and the table refreshed without an error - Tuple2 result = - tableUpdater.update(tableIdentifier, "main", SCHEMA, PartitionSpec.unpartitioned()); - assertThat(result.f0.resolvedTableSchema().sameSchema(SCHEMA)).isTrue(); - assertThat(result.f0.compareResult()).isEqualTo(CompareSchemasVisitor.Result.SAME); - assertThat(result.f1).isEqualTo(PartitionSpec.unpartitioned()); - } - - @Test - void testBranchCreationAndCaching() { - Catalog catalog = CATALOG_EXTENSION.catalog(); - TableIdentifier tableIdentifier = TableIdentifier.parse("myTable"); - TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); - TableUpdater tableUpdater = new TableUpdater(cache, catalog); - - catalog.createTable(tableIdentifier, SCHEMA); - tableUpdater.update(tableIdentifier, "myBranch", SCHEMA, PartitionSpec.unpartitioned()); - TableMetadataCache.CacheItem cacheItem = cache.getInternalCache().get(tableIdentifier); - assertThat(cacheItem).isNotNull(); - - tableUpdater.update(tableIdentifier, "myBranch", SCHEMA, PartitionSpec.unpartitioned()); - assertThat(cache.getInternalCache()).contains(Map.entry(tableIdentifier, cacheItem)); - } - - @Test - void testSpecCreation() { - Catalog catalog = CATALOG_EXTENSION.catalog(); - TableIdentifier tableIdentifier = TableIdentifier.parse("myTable"); - TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 
10); - TableUpdater tableUpdater = new TableUpdater(cache, catalog); - - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).bucket("data", 10).build(); - tableUpdater.update(tableIdentifier, "main", SCHEMA, spec); - - Table table = catalog.loadTable(tableIdentifier); - assertThat(table).isNotNull(); - assertThat(table.spec()).isEqualTo(spec); - } - - @Test - void testInvalidateOldCacheEntryOnUpdate() { - Catalog catalog = CATALOG_EXTENSION.catalog(); - TableIdentifier tableIdentifier = TableIdentifier.parse("default.myTable"); - catalog.createTable(tableIdentifier, SCHEMA); - TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); - cache.schema(tableIdentifier, SCHEMA); - TableUpdater tableUpdater = new TableUpdater(cache, catalog); - - Schema updated = - tableUpdater - .update(tableIdentifier, "main", SCHEMA2, PartitionSpec.unpartitioned()) - .f0 - .resolvedTableSchema(); - assertThat(updated.sameSchema(SCHEMA2)).isTrue(); - assertThat(cache.schema(tableIdentifier, SCHEMA2).resolvedTableSchema().sameSchema(SCHEMA2)) - .isTrue(); - } - - @Test - void testLastResultInvalidation() { - Catalog catalog = CATALOG_EXTENSION.catalog(); - TableIdentifier tableIdentifier = TableIdentifier.parse("default.myTable"); - catalog.createTable(tableIdentifier, SCHEMA); - TableMetadataCache cache = new TableMetadataCache(catalog, 10, Long.MAX_VALUE, 10); - TableUpdater tableUpdater = new TableUpdater(cache, catalog); - - // Initialize cache - tableUpdater.update(tableIdentifier, "main", SCHEMA, PartitionSpec.unpartitioned()); - - // Update table behind the scenes - catalog.dropTable(tableIdentifier); - catalog.createTable(tableIdentifier, SCHEMA2); - - // Cache still stores the old information - assertThat(cache.schema(tableIdentifier, SCHEMA2).compareResult()) - .isEqualTo(CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED); - - assertThat( - tableUpdater - .update(tableIdentifier, "main", SCHEMA2, PartitionSpec.unpartitioned()) - .f0 - .compareResult()) - .isEqualTo(CompareSchemasVisitor.Result.SAME); - - // Last result cache should be cleared - assertThat(cache.getInternalCache().get(tableIdentifier).inputSchemas()) - .doesNotContainKey(SCHEMA2); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java deleted file mode 100644 index b0d98b358b6d..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import java.util.UUID; -import java.util.concurrent.ThreadLocalRandom; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -public class DataDistributionUtil { - private DataDistributionUtil() {} - - private static final String CHARS = - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-.!?"; - - /** Generate a random string with a given prefix and a random length up to maxLength. */ - public static String randomString(String prefix, int maxLength) { - int length = ThreadLocalRandom.current().nextInt(maxLength); - byte[] buffer = new byte[length]; - - for (int i = 0; i < length; i += 1) { - buffer[i] = (byte) CHARS.charAt(ThreadLocalRandom.current().nextInt(CHARS.length())); - } - - return prefix + new String(buffer, StandardCharsets.UTF_8); - } - - /** - * Returns the index where weightsCDF[index] > target and either index == 0 or - * weightsCDF[index - 1] <= target. - */ - public static int binarySearchIndex(long[] weightsCDF, long target) { - Preconditions.checkArgument( - target >= 0, "target weight must be non-negative: search target = %s", target); - Preconditions.checkArgument( - target < weightsCDF[weightsCDF.length - 1], - "target weight is out of range: total weight = %s, search target = %s", - weightsCDF[weightsCDF.length - 1], - target); - - int start = 0; - int end = weightsCDF.length - 1; - while (start <= end) { - int mid = (start + end) / 2; - boolean leftOk = (mid == 0) || (weightsCDF[mid - 1] <= target); - boolean rightOk = weightsCDF[mid] > target; - if (leftOk && rightOk) { - return mid; - } else if (weightsCDF[mid] <= target) { - start = mid + 1; - } else { - end = mid - 1; - } - } - - throw new IllegalStateException("should never reach here"); - } - - /** Key is the id and value is the weight as a long. */ - public static NavigableMap longTailDistribution( - long startingWeight, - int longTailStartingIndex, - int longTailLength, - long longTailBaseWeight, - double weightRandomJitterPercentage, - double decayFactor) { - - NavigableMap weights = Maps.newTreeMap(); - - // decay part - long currentWeight = startingWeight; - for (int index = 0; index < longTailStartingIndex; ++index) { - double jitter = ThreadLocalRandom.current().nextDouble(weightRandomJitterPercentage / 100); - long weight = (long) (currentWeight * (1.0 + jitter)); - weight = weight > 0 ? weight : 1; - weights.put(index, weight); - if (currentWeight > longTailBaseWeight) { - currentWeight = (long) (currentWeight * decayFactor); // decay the weight by the decay factor - } - } - - // long tail part (flat with some random jitter) - for (int index = longTailStartingIndex; - index < longTailStartingIndex + longTailLength; - ++index) { - long longTailWeight = - (long) - (longTailBaseWeight - * ThreadLocalRandom.current().nextDouble(weightRandomJitterPercentage)); - longTailWeight = longTailWeight > 0 ?
longTailWeight : 1; - weights.put(index, longTailWeight); - } - - return weights; - } - - public static Map mapStatisticsWithLongTailDistribution( - NavigableMap weights, SortKey sortKey) { - Map mapStatistics = Maps.newHashMapWithExpectedSize(weights.size()); - weights.forEach( - (id, weight) -> { - SortKey sortKeyCopy = sortKey.copy(); - sortKeyCopy.set(0, id); - mapStatistics.put(sortKeyCopy, weight); - }); - - return mapStatistics; - } - - public static long[] computeCumulativeWeights(List keys, Map weights) { - long[] weightsCDF = new long[keys.size()]; - long totalWeight = 0; - for (int i = 0; i < keys.size(); ++i) { - totalWeight += weights.get(keys.get(i)); - weightsCDF[i] = totalWeight; - } - - return weightsCDF; - } - - public static byte[] uuidBytes(UUID uuid) { - ByteBuffer bb = ByteBuffer.wrap(new byte[16]); - bb.putLong(uuid.getMostSignificantBits()); - bb.putLong(uuid.getLeastSignificantBits()); - return bb.array(); - } - - public static UUID[] reservoirSampleUUIDs(int sampleSize, int reservoirSize) { - UUID[] reservoir = new UUID[reservoirSize]; - for (int i = 0; i < reservoirSize; ++i) { - reservoir[i] = UUID.randomUUID(); - } - - ThreadLocalRandom random = ThreadLocalRandom.current(); - for (int i = reservoirSize; i < sampleSize; ++i) { - int rand = random.nextInt(i + 1); - if (rand < reservoirSize) { - reservoir[rand] = UUID.randomUUID(); - } - } - - Arrays.sort(reservoir); - return reservoir; - } - - public static UUID[] rangeBoundSampleUUIDs(UUID[] sampledUUIDs, int rangeBoundSize) { - UUID[] rangeBounds = new UUID[rangeBoundSize]; - int step = sampledUUIDs.length / rangeBoundSize; - for (int i = 0; i < rangeBoundSize; ++i) { - rangeBounds[i] = sampledUUIDs[i * step]; - } - Arrays.sort(rangeBounds); - return rangeBounds; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java deleted file mode 100644 index 5910bd685510..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Comparator; -import java.util.Map; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.flink.table.types.logical.IntType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.VarCharType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderComparators; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; - -class Fixtures { - private Fixtures() {} - - public static final int NUM_SUBTASKS = 2; - public static final Schema SCHEMA = - new Schema( - Types.NestedField.optional(1, "id", Types.StringType.get()), - Types.NestedField.optional(2, "number", Types.IntegerType.get())); - public static final RowType ROW_TYPE = RowType.of(new VarCharType(), new IntType()); - public static final TypeSerializer ROW_SERIALIZER = new RowDataSerializer(ROW_TYPE); - public static final RowDataWrapper ROW_WRAPPER = new RowDataWrapper(ROW_TYPE, SCHEMA.asStruct()); - public static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - public static final Comparator SORT_ORDER_COMPARTOR = - SortOrderComparators.forSchema(SCHEMA, SORT_ORDER); - public static final SortKeySerializer SORT_KEY_SERIALIZER = - new SortKeySerializer(SCHEMA, SORT_ORDER); - public static final DataStatisticsSerializer TASK_STATISTICS_SERIALIZER = - new DataStatisticsSerializer(SORT_KEY_SERIALIZER); - public static final GlobalStatisticsSerializer GLOBAL_STATISTICS_SERIALIZER = - new GlobalStatisticsSerializer(SORT_KEY_SERIALIZER); - public static final CompletedStatisticsSerializer COMPLETED_STATISTICS_SERIALIZER = - new CompletedStatisticsSerializer(SORT_KEY_SERIALIZER); - - public static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); - public static final Map CHAR_KEYS = createCharKeys(); - - public static StatisticsEvent createStatisticsEvent( - StatisticsType type, - TypeSerializer statisticsSerializer, - long checkpointId, - SortKey... keys) { - DataStatistics statistics = createTaskStatistics(type, keys); - return StatisticsEvent.createTaskStatisticsEvent( - checkpointId, statistics, statisticsSerializer); - } - - public static DataStatistics createTaskStatistics(StatisticsType type, SortKey... 
keys) { - DataStatistics statistics; - if (type == StatisticsType.Sketch) { - statistics = new SketchDataStatistics(128); - } else { - statistics = new MapDataStatistics(); - } - - for (SortKey key : keys) { - statistics.add(key); - } - - return statistics; - } - - private static Map createCharKeys() { - Map keys = Maps.newHashMap(); - for (char c = 'a'; c <= 'z'; ++c) { - String key = Character.toString(c); - SortKey sortKey = SORT_KEY.copy(); - sortKey.set(0, key); - keys.put(key, sortKey); - } - - return keys; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java deleted file mode 100644 index 8322ce683768..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java +++ /dev/null @@ -1,465 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.TASK_STATISTICS_SERIALIZER; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.createStatisticsEvent; -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -public class TestAggregatedStatisticsTracker { - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void receiveNewerStatisticsEvent(StatisticsType type) { - AggregatedStatisticsTracker tracker = createTracker(type); - - StatisticsEvent checkpoint1Subtask0StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); - CompletedStatistics completedStatistics = - tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - AggregatedStatisticsTracker.Aggregation aggregation = - tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a")); - } - - StatisticsEvent checkpoint2Subtask0StatisticsEvent = - createStatisticsEvent( - type, - 
TASK_STATISTICS_SERIALIZER, - 2L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b")); - completedStatistics = tracker.updateAndCheckCompletion(0, checkpoint2Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - // both checkpoints are tracked - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L); - aggregation = tracker.aggregationsPerCheckpoint().get(2L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); - } - - StatisticsEvent checkpoint1Subtask1StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); - // checkpoint 1 is completed - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - assertThat(completedStatistics.checkpointId()).isEqualTo(1L); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()) - .isEqualTo( - ImmutableMap.of( - CHAR_KEYS.get("a"), 1L, - CHAR_KEYS.get("b"), 1L)); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); - } - - // checkpoint 2 remains - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); - aggregation = tracker.aggregationsPerCheckpoint().get(2L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); - } - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void receiveOlderStatisticsEventTest(StatisticsType type) { - AggregatedStatisticsTracker tracker = createTracker(type); - - StatisticsEvent checkpoint2Subtask0StatisticsEvent = - createStatisticsEvent( - type, - TASK_STATISTICS_SERIALIZER, - 2L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b")); - CompletedStatistics completedStatistics = - tracker.updateAndCheckCompletion(0, checkpoint2Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); - AggregatedStatisticsTracker.Aggregation aggregation = - tracker.aggregationsPerCheckpoint().get(2L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); - } - - StatisticsEvent checkpoint1Subtask1StatisticsEvent = - 
createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); - assertThat(completedStatistics).isNull(); - // both checkpoints are tracked - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L); - aggregation = tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("b"), 1L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("b")); - } - - StatisticsEvent checkpoint3Subtask0StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 3L, CHAR_KEYS.get("x")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint3Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L, 3L); - aggregation = tracker.aggregationsPerCheckpoint().get(3L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("x"), 1L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("x")); - } - - StatisticsEvent checkpoint2Subtask1StatisticsEvent = - createStatisticsEvent( - type, - TASK_STATISTICS_SERIALIZER, - 2L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint2Subtask1StatisticsEvent); - // checkpoint 1 is cleared along with checkpoint 2. 
checkpoint 3 remains - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(3L); - aggregation = tracker.aggregationsPerCheckpoint().get(3L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("x"), 1L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("x")); - } - - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - assertThat(completedStatistics.checkpointId()).isEqualTo(2L); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()) - .isEqualTo( - ImmutableMap.of( - CHAR_KEYS.get("a"), 2L, - CHAR_KEYS.get("b"), 4L)); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly( - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b")); - } - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void receiveCompletedStatisticsEvent(StatisticsType type) { - AggregatedStatisticsTracker tracker = createTracker(type); - - StatisticsEvent checkpoint1Subtask0DataStatisticEvent = - createStatisticsEvent( - type, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b")); - - CompletedStatistics completedStatistics = - tracker.updateAndCheckCompletion(0, checkpoint1Subtask0DataStatisticEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - AggregatedStatisticsTracker.Aggregation aggregation = - tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); - } - - StatisticsEvent checkpoint1Subtask1DataStatisticEvent = - createStatisticsEvent( - type, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b")); - - // Receive data statistics from all subtasks at checkpoint 1 - completedStatistics = - tracker.updateAndCheckCompletion(1, checkpoint1Subtask1DataStatisticEvent); - assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); - - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - assertThat(completedStatistics.checkpointId()).isEqualTo(1L); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()) - .isEqualTo( - ImmutableMap.of( - CHAR_KEYS.get("a"), 3L, - CHAR_KEYS.get("b"), 3L)); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly( - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("a"), - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b")); - } - - StatisticsEvent checkpoint2Subtask0DataStatisticEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("a")); - completedStatistics = - 
tracker.updateAndCheckCompletion(0, checkpoint2Subtask0DataStatisticEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); - aggregation = tracker.aggregationsPerCheckpoint().get(2L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a")); - } - - StatisticsEvent checkpoint2Subtask1DataStatisticEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("b")); - // Receive data statistics from all subtasks at checkpoint 2 - completedStatistics = - tracker.updateAndCheckCompletion(1, checkpoint2Subtask1DataStatisticEvent); - assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); - - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.checkpointId()).isEqualTo(2L); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()) - .isEqualTo( - ImmutableMap.of( - CHAR_KEYS.get("a"), 1L, - CHAR_KEYS.get("b"), 1L)); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); - } - } - - @Test - public void coordinatorSwitchToSketchOverThreshold() { - int parallelism = 3; - int downstreamParallelism = 3; - int switchToSketchThreshold = 3; - AggregatedStatisticsTracker tracker = - new AggregatedStatisticsTracker( - "testOperator", - parallelism, - Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - downstreamParallelism, - StatisticsType.Auto, - switchToSketchThreshold, - null); - - StatisticsEvent checkpoint1Subtask0StatisticsEvent = - createStatisticsEvent( - StatisticsType.Map, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b")); - CompletedStatistics completedStatistics = - tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - AggregatedStatisticsTracker.Aggregation aggregation = - tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); - assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Map); - assertThat(aggregation.sketchStatistics()).isNull(); - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); - - StatisticsEvent checkpoint1Subtask1StatisticsEvent = - createStatisticsEvent( - StatisticsType.Map, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - aggregation = tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0, 1); - // converted to sketch statistics as map size is 4 (over the switch threshold of 3) - assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Sketch); - assertThat(aggregation.mapStatistics()).isNull(); - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder( - 
CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("d")); - - StatisticsEvent checkpoint1Subtask2StatisticsEvent = - createStatisticsEvent( - StatisticsType.Map, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f")); - completedStatistics = tracker.updateAndCheckCompletion(2, checkpoint1Subtask2StatisticsEvent); - assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsType.Sketch); - assertThat(completedStatistics.keySamples()) - .containsExactly( - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d"), - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f")); - } - - @Test - public void coordinatorMapOperatorSketch() { - int parallelism = 3; - int downstreamParallelism = 3; - AggregatedStatisticsTracker tracker = - new AggregatedStatisticsTracker( - "testOperator", - parallelism, - Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - downstreamParallelism, - StatisticsType.Auto, - SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, - null); - - // first operator event has map statistics - StatisticsEvent checkpoint1Subtask0StatisticsEvent = - createStatisticsEvent( - StatisticsType.Map, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b")); - CompletedStatistics completedStatistics = - tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - AggregatedStatisticsTracker.Aggregation aggregation = - tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); - assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Map); - assertThat(aggregation.sketchStatistics()).isNull(); - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); - - // second operator event contains sketch statistics - StatisticsEvent checkpoint1Subtask1StatisticsEvent = - createStatisticsEvent( - StatisticsType.Sketch, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - aggregation = tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0, 1); - assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Sketch); - assertThat(aggregation.mapStatistics()).isNull(); - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder( - CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("d")); - - // third operator event has Map statistics - StatisticsEvent checkpoint1Subtask2StatisticsEvent = - createStatisticsEvent( - StatisticsType.Map, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f")); - completedStatistics = tracker.updateAndCheckCompletion(2, checkpoint1Subtask2StatisticsEvent); - assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsType.Sketch); - assertThat(completedStatistics.keySamples()) - .containsExactly( - CHAR_KEYS.get("a"), - 
CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d"), - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f")); - } - - private AggregatedStatisticsTracker createTracker(StatisticsType type) { - return new AggregatedStatisticsTracker( - "testOperator", - Fixtures.NUM_SUBTASKS, - Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - Fixtures.NUM_SUBTASKS, - type, - SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, - null); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java deleted file mode 100644 index 1975d7e8d654..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.assertj.core.api.AssertionsForClassTypes.assertThat; - -import org.apache.flink.api.common.typeutils.SerializerTestBase; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.Test; - -public class TestCompletedStatisticsSerializer extends SerializerTestBase { - - @Override - protected TypeSerializer createSerializer() { - return Fixtures.COMPLETED_STATISTICS_SERIALIZER; - } - - @Override - protected int getLength() { - return -1; - } - - @Override - protected Class getTypeClass() { - return CompletedStatistics.class; - } - - @Override - protected CompletedStatistics[] getTestData() { - - return new CompletedStatistics[] { - CompletedStatistics.fromKeyFrequency( - 1L, ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)), - CompletedStatistics.fromKeySamples(2L, new SortKey[] {CHAR_KEYS.get("a"), CHAR_KEYS.get("b")}) - }; - } - - @Test - public void testSerializer() throws Exception { - TypeSerializer completedStatisticsTypeSerializer = createSerializer(); - CompletedStatistics[] data = getTestData(); - DataOutputSerializer output = new DataOutputSerializer(1024); - completedStatisticsTypeSerializer.serialize(data[0], output); - byte[] serializedBytes = output.getCopyOfBuffer(); - - DataInputDeserializer input = new DataInputDeserializer(serializedBytes); - CompletedStatistics deserialized = completedStatisticsTypeSerializer.deserialize(input); - assertThat(deserialized).isEqualTo(data[0]); - } - - @Test - public void 
testRestoreOldVersionSerializer() throws Exception { - CompletedStatisticsSerializer completedStatisticsTypeSerializer = - (CompletedStatisticsSerializer) createSerializer(); - completedStatisticsTypeSerializer.changeSortKeySerializerVersion(1); - CompletedStatistics[] data = getTestData(); - DataOutputSerializer output = new DataOutputSerializer(1024); - completedStatisticsTypeSerializer.serialize(data[0], output); - byte[] serializedBytes = output.getCopyOfBuffer(); - - completedStatisticsTypeSerializer.changeSortKeySerializerVersionLatest(); - CompletedStatistics completedStatistics = - StatisticsUtil.deserializeCompletedStatistics( - serializedBytes, completedStatisticsTypeSerializer); - assertThat(completedStatistics).isEqualTo(data[0]); - } - - @Test - public void testRestoreNewSerializer() throws Exception { - CompletedStatisticsSerializer completedStatisticsTypeSerializer = - (CompletedStatisticsSerializer) createSerializer(); - CompletedStatistics[] data = getTestData(); - DataOutputSerializer output = new DataOutputSerializer(1024); - completedStatisticsTypeSerializer.serialize(data[0], output); - byte[] serializedBytes = output.getCopyOfBuffer(); - - CompletedStatistics completedStatistics = - StatisticsUtil.deserializeCompletedStatistics( - serializedBytes, completedStatisticsTypeSerializer); - assertThat(completedStatistics).isEqualTo(data[0]); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java deleted file mode 100644 index a9dd1b5d8173..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataDistributionUtil.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
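
The restore tests above boil down to one pattern: write a value through the TypeSerializer into an in-memory buffer, then read it back and compare. A minimal sketch of that round trip is shown below; it is illustrative only (the RoundTrip helper is not taken from the patch) and works for any Flink TypeSerializer, which is what SerializerTestBase subclasses like the one above supply.

    import java.io.IOException;
    import org.apache.flink.api.common.typeutils.TypeSerializer;
    import org.apache.flink.core.memory.DataInputDeserializer;
    import org.apache.flink.core.memory.DataOutputSerializer;

    final class RoundTrip {
      // Serialize a value into an in-memory buffer and immediately deserialize it again.
      static <T> T roundTrip(TypeSerializer<T> serializer, T value) throws IOException {
        DataOutputSerializer output = new DataOutputSerializer(1024);
        serializer.serialize(value, output);

        DataInputDeserializer input = new DataInputDeserializer(output.getCopyOfBuffer());
        return serializer.deserialize(input);
      }
    }
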
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.DataDistributionUtil.binarySearchIndex; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import org.junit.jupiter.api.Test; - -public class TestDataDistributionUtil { - @Test - public void testBinarySearchIndex() { - long[] weightsUDF = {10, 20, 30, 40, 50}; - assertThat(binarySearchIndex(weightsUDF, 0)).isEqualTo(0); - assertThat(binarySearchIndex(weightsUDF, 9)).isEqualTo(0); - assertThat(binarySearchIndex(weightsUDF, 10)).isEqualTo(1); - assertThat(binarySearchIndex(weightsUDF, 15)).isEqualTo(1); - assertThat(binarySearchIndex(weightsUDF, 20)).isEqualTo(2); - assertThat(binarySearchIndex(weightsUDF, 29)).isEqualTo(2); - assertThat(binarySearchIndex(weightsUDF, 30)).isEqualTo(3); - assertThat(binarySearchIndex(weightsUDF, 31)).isEqualTo(3); - assertThat(binarySearchIndex(weightsUDF, 40)).isEqualTo(4); - - // Test with a target that is out of range - assertThatThrownBy(() -> binarySearchIndex(weightsUDF, -1)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("target weight must be non-negative"); - assertThatThrownBy(() -> binarySearchIndex(weightsUDF, 50)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("target weight is out of range"); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java deleted file mode 100644 index 0a6caf2aaa98..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
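
The assertions in TestDataDistributionUtil above pin down the contract of binarySearchIndex: given strictly increasing cumulative weights, a non-negative target maps to the first index whose cumulative weight is strictly greater than the target, and anything below zero or at or beyond the last cumulative weight is rejected. The sketch below is one way to satisfy that contract; it illustrates the expected behaviour and is not the actual DataDistributionUtil implementation.

    final class CumulativeWeightSearch {
      // Returns the first index i with target < cumulativeWeights[i].
      // Mirrors the deleted test's expectations: 0..9 -> 0, 10..19 -> 1, ..., 40..49 -> 4.
      static int firstIndexExceeding(long[] cumulativeWeights, long target) {
        if (target < 0) {
          throw new IllegalArgumentException("target weight must be non-negative: " + target);
        }

        int lo = 0;
        int hi = cumulativeWeights.length - 1;
        if (target >= cumulativeWeights[hi]) {
          throw new IllegalArgumentException("target weight is out of range: " + target);
        }

        while (lo < hi) {
          int mid = (lo + hi) >>> 1;
          if (cumulativeWeights[mid] <= target) {
            lo = mid + 1; // target is at or past this boundary, look right
          } else {
            hi = mid; // target falls before this boundary, look left
          }
        }

        return lo;
      }
    }
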
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.NUM_SUBTASKS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.time.Duration; -import java.util.Map; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutionException; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.runtime.operators.coordination.EventReceivingTasks; -import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; -import org.apache.flink.util.ExceptionUtils; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -public class TestDataStatisticsCoordinator { - private static final String OPERATOR_NAME = "TestCoordinator"; - private static final OperatorID TEST_OPERATOR_ID = new OperatorID(1234L, 5678L); - - private EventReceivingTasks receivingTasks; - - @BeforeEach - public void before() throws Exception { - receivingTasks = EventReceivingTasks.createForRunningTasks(); - } - - private void tasksReady(DataStatisticsCoordinator coordinator) { - setAllTasksReady(NUM_SUBTASKS, coordinator, receivingTasks); - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testThrowExceptionWhenNotStarted(StatisticsType type) throws Exception { - try (DataStatisticsCoordinator dataStatisticsCoordinator = createCoordinator(type)) { - String failureMessage = "The coordinator of TestCoordinator has not started yet."; - assertThatThrownBy( - () -> - dataStatisticsCoordinator.handleEventFromOperator( - 0, - 0, - StatisticsEvent.createTaskStatisticsEvent( - 0, new MapDataStatistics(), Fixtures.TASK_STATISTICS_SERIALIZER))) - .isInstanceOf(IllegalStateException.class) - .hasMessage(failureMessage); - assertThatThrownBy(() -> dataStatisticsCoordinator.executionAttemptFailed(0, 0, null)) - .isInstanceOf(IllegalStateException.class) - .hasMessage(failureMessage); - assertThatThrownBy(() -> dataStatisticsCoordinator.checkpointCoordinator(0, null)) - .isInstanceOf(IllegalStateException.class) - .hasMessage(failureMessage); - } - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testDataStatisticsEventHandling(StatisticsType type) throws Exception { - try (DataStatisticsCoordinator dataStatisticsCoordinator = createCoordinator(type)) { - dataStatisticsCoordinator.start(); - tasksReady(dataStatisticsCoordinator); - - StatisticsEvent checkpoint1Subtask0DataStatisticEvent = - Fixtures.createStatisticsEvent( - type, - Fixtures.TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c")); - StatisticsEvent checkpoint1Subtask1DataStatisticEvent = - Fixtures.createStatisticsEvent( - type, - Fixtures.TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c")); - // Handle events from operators for checkpoint 1 - dataStatisticsCoordinator.handleEventFromOperator( - 0, 0, 
checkpoint1Subtask0DataStatisticEvent); - dataStatisticsCoordinator.handleEventFromOperator( - 1, 0, checkpoint1Subtask1DataStatisticEvent); - - waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - Map keyFrequency = - ImmutableMap.of( - CHAR_KEYS.get("a"), 2L, - CHAR_KEYS.get("b"), 3L, - CHAR_KEYS.get("c"), 5L); - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(NUM_SUBTASKS, keyFrequency, 0.0d, SORT_ORDER_COMPARTOR); - - CompletedStatistics completedStatistics = dataStatisticsCoordinator.completedStatistics(); - assertThat(completedStatistics.checkpointId()).isEqualTo(1L); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()).isEqualTo(keyFrequency); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly( - CHAR_KEYS.get("a"), - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c")); - } - - GlobalStatistics globalStatistics = dataStatisticsCoordinator.globalStatistics(); - assertThat(globalStatistics.checkpointId()).isEqualTo(1L); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); - } else { - assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("b")); - } - } - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testDataStatisticsEventHandlingWithNullValue(StatisticsType type) throws Exception { - try (DataStatisticsCoordinator dataStatisticsCoordinator = createCoordinator(type)) { - dataStatisticsCoordinator.start(); - tasksReady(dataStatisticsCoordinator); - - SortKey nullSortKey = Fixtures.SORT_KEY.copy(); - nullSortKey.set(0, null); - - StatisticsEvent checkpoint1Subtask0DataStatisticEvent = - Fixtures.createStatisticsEvent( - type, - Fixtures.TASK_STATISTICS_SERIALIZER, - 1L, - nullSortKey, - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c")); - StatisticsEvent checkpoint1Subtask1DataStatisticEvent = - Fixtures.createStatisticsEvent( - type, - Fixtures.TASK_STATISTICS_SERIALIZER, - 1L, - nullSortKey, - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c")); - // Handle events from operators for checkpoint 1 - dataStatisticsCoordinator.handleEventFromOperator( - 0, 0, checkpoint1Subtask0DataStatisticEvent); - dataStatisticsCoordinator.handleEventFromOperator( - 1, 0, checkpoint1Subtask1DataStatisticEvent); - - waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - Map keyFrequency = - ImmutableMap.of(nullSortKey, 2L, CHAR_KEYS.get("b"), 3L, CHAR_KEYS.get("c"), 5L); - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(NUM_SUBTASKS, keyFrequency, 0.0d, SORT_ORDER_COMPARTOR); - - CompletedStatistics completedStatistics = dataStatisticsCoordinator.completedStatistics(); - assertThat(completedStatistics.checkpointId()).isEqualTo(1L); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()).isEqualTo(keyFrequency); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly( - nullSortKey, - nullSortKey, - 
CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c")); - } - - GlobalStatistics globalStatistics = dataStatisticsCoordinator.globalStatistics(); - assertThat(globalStatistics.checkpointId()).isEqualTo(1L); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); - } else { - assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("b")); - } - } - } - - @Test - public void testRequestGlobalStatisticsEventHandling() throws Exception { - try (DataStatisticsCoordinator dataStatisticsCoordinator = - createCoordinator(StatisticsType.Sketch)) { - dataStatisticsCoordinator.start(); - tasksReady(dataStatisticsCoordinator); - - // receive request before global statistics is ready - dataStatisticsCoordinator.handleEventFromOperator(0, 0, new RequestGlobalStatisticsEvent()); - assertThat(receivingTasks.getSentEventsForSubtask(0)).isEmpty(); - assertThat(receivingTasks.getSentEventsForSubtask(1)).isEmpty(); - - StatisticsEvent checkpoint1Subtask0DataStatisticEvent = - Fixtures.createStatisticsEvent( - StatisticsType.Sketch, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); - StatisticsEvent checkpoint1Subtask1DataStatisticEvent = - Fixtures.createStatisticsEvent( - StatisticsType.Sketch, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); - // Handle events from operators for checkpoint 1 - dataStatisticsCoordinator.handleEventFromOperator( - 0, 0, checkpoint1Subtask0DataStatisticEvent); - dataStatisticsCoordinator.handleEventFromOperator( - 1, 0, checkpoint1Subtask1DataStatisticEvent); - - waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - Awaitility.await("wait for statistics event") - .pollInterval(Duration.ofMillis(10)) - .atMost(Duration.ofSeconds(10)) - .until(() -> receivingTasks.getSentEventsForSubtask(0).size() == 1); - assertThat(receivingTasks.getSentEventsForSubtask(0).get(0)) - .isInstanceOf(StatisticsEvent.class); - - Awaitility.await("wait for statistics event") - .pollInterval(Duration.ofMillis(10)) - .atMost(Duration.ofSeconds(10)) - .until(() -> receivingTasks.getSentEventsForSubtask(1).size() == 1); - assertThat(receivingTasks.getSentEventsForSubtask(1).get(0)) - .isInstanceOf(StatisticsEvent.class); - - dataStatisticsCoordinator.handleEventFromOperator(1, 0, new RequestGlobalStatisticsEvent()); - - // coordinator should send a response to subtask 1 - Awaitility.await("wait for statistics event") - .pollInterval(Duration.ofMillis(10)) - .atMost(Duration.ofSeconds(10)) - .until(() -> receivingTasks.getSentEventsForSubtask(1).size() == 2); - assertThat(receivingTasks.getSentEventsForSubtask(1).get(0)) - .isInstanceOf(StatisticsEvent.class); - assertThat(receivingTasks.getSentEventsForSubtask(1).get(1)) - .isInstanceOf(StatisticsEvent.class); - } - } - - @Test - public void testMultipleRequestGlobalStatisticsEvents() throws Exception { - try (DataStatisticsCoordinator dataStatisticsCoordinator = - createCoordinator(StatisticsType.Map)) { - dataStatisticsCoordinator.start(); - tasksReady(dataStatisticsCoordinator); - - StatisticsEvent checkpoint1Subtask0DataStatisticEvent = - Fixtures.createStatisticsEvent( - StatisticsType.Map, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); - StatisticsEvent checkpoint1Subtask1DataStatisticEvent = - 
Fixtures.createStatisticsEvent( - StatisticsType.Map, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); - - dataStatisticsCoordinator.handleEventFromOperator( - 0, 0, checkpoint1Subtask0DataStatisticEvent); - dataStatisticsCoordinator.handleEventFromOperator( - 1, 0, checkpoint1Subtask1DataStatisticEvent); - - waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - // signature is null - dataStatisticsCoordinator.handleEventFromOperator(0, 0, new RequestGlobalStatisticsEvent()); - - // Checkpoint StatisticEvent + RequestGlobalStatisticsEvent - Awaitility.await("wait for first statistics event") - .pollInterval(Duration.ofMillis(10)) - .atMost(Duration.ofSeconds(10)) - .until(() -> receivingTasks.getSentEventsForSubtask(0).size() == 2); - - // Simulate the scenario where a subtask send global statistics request with the same hash - // code. The coordinator would skip the response after comparing the request contained hash - // code with latest global statistics hash code. - int correctSignature = dataStatisticsCoordinator.globalStatistics().hashCode(); - dataStatisticsCoordinator.handleEventFromOperator( - 0, 0, new RequestGlobalStatisticsEvent(correctSignature)); - - waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - // Checkpoint StatisticEvent + RequestGlobalStatisticsEvent - assertThat(receivingTasks.getSentEventsForSubtask(0).size()).isEqualTo(2); - - // signature is different - dataStatisticsCoordinator.handleEventFromOperator( - 0, 0, new RequestGlobalStatisticsEvent(correctSignature + 1)); - - // Checkpoint StatisticEvent + RequestGlobalStatisticsEvent + RequestGlobalStatisticsEvent - Awaitility.await("wait for second statistics event") - .pollInterval(Duration.ofMillis(10)) - .atMost(Duration.ofSeconds(10)) - .until(() -> receivingTasks.getSentEventsForSubtask(0).size() == 3); - } - } - - static void setAllTasksReady( - int subtasks, - DataStatisticsCoordinator dataStatisticsCoordinator, - EventReceivingTasks receivingTasks) { - for (int i = 0; i < subtasks; i++) { - dataStatisticsCoordinator.executionAttemptReady( - i, 0, receivingTasks.createGatewayForSubtask(i, 0)); - } - } - - static void waitForCoordinatorToProcessActions(DataStatisticsCoordinator coordinator) { - CompletableFuture future = new CompletableFuture<>(); - coordinator.callInCoordinatorThread( - () -> { - future.complete(null); - return null; - }, - "Coordinator fails to process action"); - - try { - future.get(); - } catch (InterruptedException e) { - throw new AssertionError("test interrupted"); - } catch (ExecutionException e) { - ExceptionUtils.rethrow(ExceptionUtils.stripExecutionException(e)); - } - } - - private static DataStatisticsCoordinator createCoordinator(StatisticsType type) { - return new DataStatisticsCoordinator( - OPERATOR_NAME, - new MockOperatorCoordinatorContext(TEST_OPERATOR_ID, NUM_SUBTASKS), - Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - NUM_SUBTASKS, - type, - 0.0d); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java deleted file mode 100644 index 6317f2bfde18..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.TASK_STATISTICS_SERIALIZER; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.createStatisticsEvent; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Map; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutionException; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.runtime.operators.coordination.EventReceivingTasks; -import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; -import org.apache.flink.runtime.operators.coordination.RecreateOnResetOperatorCoordinator; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -public class TestDataStatisticsCoordinatorProvider { - private static final OperatorID OPERATOR_ID = new OperatorID(); - - private EventReceivingTasks receivingTasks; - - @BeforeEach - public void before() { - receivingTasks = EventReceivingTasks.createForRunningTasks(); - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testCheckpointAndReset(StatisticsType type) throws Exception { - DataStatisticsCoordinatorProvider provider = createProvider(type, Fixtures.NUM_SUBTASKS); - try (RecreateOnResetOperatorCoordinator coordinator = - (RecreateOnResetOperatorCoordinator) - provider.create( - new MockOperatorCoordinatorContext(OPERATOR_ID, Fixtures.NUM_SUBTASKS))) { - DataStatisticsCoordinator dataStatisticsCoordinator = - (DataStatisticsCoordinator) coordinator.getInternalCoordinator(); - - // Start the coordinator - coordinator.start(); - TestDataStatisticsCoordinator.setAllTasksReady( - Fixtures.NUM_SUBTASKS, dataStatisticsCoordinator, receivingTasks); - - // Handle events from operators for checkpoint 1 - StatisticsEvent checkpoint1Subtask0StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); - coordinator.handleEventFromOperator(0, 0, checkpoint1Subtask0StatisticsEvent); - TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - StatisticsEvent checkpoint1Subtask1StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); - coordinator.handleEventFromOperator(1, 0, checkpoint1Subtask1StatisticsEvent); - TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - // Verify checkpoint 1 global 
data statistics - Map checkpoint1KeyFrequency = - ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L); - MapAssignment checkpoint1MapAssignment = - MapAssignment.fromKeyFrequency( - Fixtures.NUM_SUBTASKS, checkpoint1KeyFrequency, 0.0d, SORT_ORDER_COMPARTOR); - - CompletedStatistics completedStatistics = dataStatisticsCoordinator.completedStatistics(); - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()).isEqualTo(checkpoint1KeyFrequency); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); - } - - GlobalStatistics globalStatistics = dataStatisticsCoordinator.globalStatistics(); - assertThat(globalStatistics).isNotNull(); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint1MapAssignment); - } else { - assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("a")); - } - - byte[] checkpoint1Bytes = waitForCheckpoint(1L, dataStatisticsCoordinator); - - StatisticsEvent checkpoint2Subtask0StatisticsEvent = - createStatisticsEvent( - type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("d"), CHAR_KEYS.get("e")); - coordinator.handleEventFromOperator(0, 0, checkpoint2Subtask0StatisticsEvent); - TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - StatisticsEvent checkpoint2Subtask1StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("f")); - coordinator.handleEventFromOperator(1, 0, checkpoint2Subtask1StatisticsEvent); - TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - // Verify checkpoint 2 global data statistics - Map checkpoint2KeyFrequency = - ImmutableMap.of(CHAR_KEYS.get("d"), 1L, CHAR_KEYS.get("e"), 1L, CHAR_KEYS.get("f"), 1L); - MapAssignment checkpoint2MapAssignment = - MapAssignment.fromKeyFrequency( - Fixtures.NUM_SUBTASKS, checkpoint2KeyFrequency, 0.0d, SORT_ORDER_COMPARTOR); - completedStatistics = dataStatisticsCoordinator.completedStatistics(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()).isEqualTo(checkpoint2KeyFrequency); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly(CHAR_KEYS.get("d"), CHAR_KEYS.get("e"), CHAR_KEYS.get("f")); - } - - globalStatistics = dataStatisticsCoordinator.globalStatistics(); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint2MapAssignment); - } else { - assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("e")); - } - - waitForCheckpoint(2L, dataStatisticsCoordinator); - - // Reset coordinator to checkpoint 1 - coordinator.resetToCheckpoint(1L, checkpoint1Bytes); - DataStatisticsCoordinator restoredDataStatisticsCoordinator = - (DataStatisticsCoordinator) coordinator.getInternalCoordinator(); - assertThat(dataStatisticsCoordinator).isNotSameAs(restoredDataStatisticsCoordinator); - - completedStatistics = 
restoredDataStatisticsCoordinator.completedStatistics(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - // Verify restored data statistics - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); - } - - globalStatistics = restoredDataStatisticsCoordinator.globalStatistics(); - assertThat(globalStatistics).isNotNull(); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint1MapAssignment); - } else { - assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("a")); - } - } - } - - private byte[] waitForCheckpoint(long checkpointId, DataStatisticsCoordinator coordinator) - throws InterruptedException, ExecutionException { - CompletableFuture future = new CompletableFuture<>(); - coordinator.checkpointCoordinator(checkpointId, future); - return future.get(); - } - - private static DataStatisticsCoordinatorProvider createProvider( - StatisticsType type, int downstreamParallelism) { - return new DataStatisticsCoordinatorProvider( - "DataStatisticsCoordinatorProvider", - OPERATOR_ID, - Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - downstreamParallelism, - type, - 0.0); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java deleted file mode 100644 index f7a7a147e73a..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
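
testCheckpointAndReset above drives the coordinator's state round trip by hand: request a checkpoint, keep the resulting bytes, and later hand them back through resetToCheckpoint. A small generic version of that handshake is sketched below, assuming any org.apache.flink.runtime.operators.coordination.OperatorCoordinator; the CoordinatorCheckpoints helper name is made up for illustration.

    import java.util.concurrent.CompletableFuture;
    import org.apache.flink.runtime.operators.coordination.OperatorCoordinator;

    final class CoordinatorCheckpoints {
      // Request a coordinator checkpoint and block until the serialized state is available.
      static byte[] snapshot(OperatorCoordinator coordinator, long checkpointId) throws Exception {
        CompletableFuture<byte[]> future = new CompletableFuture<>();
        coordinator.checkpointCoordinator(checkpointId, future);
        return future.get();
      }

      // Roll the coordinator back to previously captured state.
      static void restore(OperatorCoordinator coordinator, long checkpointId, byte[] state)
          throws Exception {
        coordinator.resetToCheckpoint(checkpointId, state);
      }
    }
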
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.verify; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.state.OperatorStateStore; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.fs.CloseableRegistry; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.runtime.execution.Environment; -import org.apache.flink.runtime.operators.coordination.MockOperatorEventGateway; -import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; -import org.apache.flink.runtime.state.AbstractStateBackend; -import org.apache.flink.runtime.state.OperatorStateBackendParametersImpl; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateInitializationContextImpl; -import org.apache.flink.runtime.state.TestTaskStateManager; -import org.apache.flink.runtime.state.hashmap.HashMapStateBackend; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.runtime.tasks.OneInputStreamTask; -import org.apache.flink.streaming.runtime.tasks.StreamMockEnvironment; -import org.apache.flink.streaming.util.MockOutput; -import org.apache.flink.streaming.util.MockStreamConfig; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.EnumSource; -import org.junit.jupiter.params.provider.MethodSource; -import org.mockito.Mockito; - -public class TestDataStatisticsOperator { - - private Environment env; - - @BeforeEach - public void before() throws Exception { - this.env = - new StreamMockEnvironment( - new Configuration(), - new Configuration(), - new ExecutionConfig(), - 1L, - new MockInputSplitProvider(), - 1, - new TestTaskStateManager()); - } - - private DataStatisticsOperator createOperator(StatisticsType type, int downstreamParallelism) - throws Exception { - MockOperatorEventGateway mockGateway = new MockOperatorEventGateway(); - return createOperator(type, downstreamParallelism, mockGateway); - } - - private DataStatisticsOperator createOperator( - StatisticsType type, int downstreamParallelism, MockOperatorEventGateway mockGateway) - throws Exception { - DataStatisticsOperator operator = - new DataStatisticsOperator( - "testOperator", 
- Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - mockGateway, - downstreamParallelism, - type); - operator.setup( - new OneInputStreamTask(env), - new MockStreamConfig(new Configuration(), 1), - new MockOutput<>(Lists.newArrayList())); - return operator; - } - - @SuppressWarnings("unchecked") - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testProcessElement(StatisticsType type) throws Exception { - DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); - try (OneInputStreamOperatorTestHarness testHarness = - createHarness(operator)) { - StateInitializationContext stateContext = getStateContext(); - operator.initializeState(stateContext); - operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 5))); - operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 3))); - operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 1))); - - DataStatistics localStatistics = operator.localStatistics(); - assertThat(localStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - Map keyFrequency = (Map) localStatistics.result(); - assertThat(keyFrequency) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 1L)); - } else { - ReservoirItemsSketch sketch = - (ReservoirItemsSketch) localStatistics.result(); - assertThat(sketch.getSamples()) - .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); - } - - testHarness.endInput(); - } - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testProcessElementWithNull(StatisticsType type) throws Exception { - DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); - try (OneInputStreamOperatorTestHarness testHarness = - createHarness(operator)) { - StateInitializationContext stateContext = getStateContext(); - operator.initializeState(stateContext); - operator.processElement(new StreamRecord<>(GenericRowData.of(null, 5))); - operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 3))); - - DataStatistics localStatistics = operator.localStatistics(); - SortKeySerializer sortKeySerializer = - new SortKeySerializer(Fixtures.SCHEMA, Fixtures.SORT_ORDER); - DataStatisticsSerializer taskStatisticsSerializer = - new DataStatisticsSerializer(sortKeySerializer); - DataOutputSerializer outputView = new DataOutputSerializer(1024); - - taskStatisticsSerializer.serialize(localStatistics, outputView); - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - DataStatistics dataStatistics = taskStatisticsSerializer.deserialize(inputView); - - testHarness.endInput(); - - assertThat(localStatistics).isEqualTo(dataStatistics); - } - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testOperatorOutput(StatisticsType type) throws Exception { - DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); - try (OneInputStreamOperatorTestHarness testHarness = - createHarness(operator)) { - testHarness.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 2))); - testHarness.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 3))); - testHarness.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 1))); - - List recordsOutput = - testHarness.extractOutputValues().stream() - 
.filter(StatisticsOrRecord::hasRecord) - .map(StatisticsOrRecord::record) - .collect(Collectors.toList()); - assertThat(recordsOutput) - .containsExactlyInAnyOrderElementsOf( - ImmutableList.of( - GenericRowData.of(StringData.fromString("a"), 2), - GenericRowData.of(StringData.fromString("b"), 3), - GenericRowData.of(StringData.fromString("b"), 1))); - } - } - - private static Stream provideRestoreStateParameters() { - return Stream.of( - Arguments.of(StatisticsType.Map, -1), - Arguments.of(StatisticsType.Map, 0), - Arguments.of(StatisticsType.Map, 1), - Arguments.of(StatisticsType.Sketch, -1), - Arguments.of(StatisticsType.Sketch, 0), - Arguments.of(StatisticsType.Sketch, 1)); - } - - @ParameterizedTest - @MethodSource("provideRestoreStateParameters") - public void testRestoreState(StatisticsType type, int parallelismAdjustment) throws Exception { - Map keyFrequency = - ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 1L, CHAR_KEYS.get("c"), 1L); - SortKey[] rangeBounds = new SortKey[] {CHAR_KEYS.get("a")}; - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(2, keyFrequency, 0.0d, SORT_ORDER_COMPARTOR); - DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); - OperatorSubtaskState snapshot; - try (OneInputStreamOperatorTestHarness testHarness1 = - createHarness(operator)) { - GlobalStatistics statistics; - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - statistics = GlobalStatistics.fromMapAssignment(1L, mapAssignment); - } else { - statistics = GlobalStatistics.fromRangeBounds(1L, rangeBounds); - } - - StatisticsEvent event = - StatisticsEvent.createGlobalStatisticsEvent( - statistics, Fixtures.GLOBAL_STATISTICS_SERIALIZER, false); - operator.handleOperatorEvent(event); - - GlobalStatistics globalStatistics = operator.globalStatistics(); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); - assertThat(globalStatistics.rangeBounds()).isNull(); - } else { - assertThat(globalStatistics.mapAssignment()).isNull(); - assertThat(globalStatistics.rangeBounds()).isEqualTo(rangeBounds); - } - - snapshot = testHarness1.snapshot(1L, 0); - } - - // Use the snapshot to initialize state for another new operator and then verify that the global - // statistics for the new operator is same as before - MockOperatorEventGateway spyGateway = Mockito.spy(new MockOperatorEventGateway()); - DataStatisticsOperator restoredOperator = - createOperator(type, Fixtures.NUM_SUBTASKS + parallelismAdjustment, spyGateway); - try (OneInputStreamOperatorTestHarness testHarness2 = - new OneInputStreamOperatorTestHarness<>(restoredOperator, 2, 2, 1)) { - testHarness2.setup(); - testHarness2.initializeState(snapshot); - - GlobalStatistics globalStatistics = restoredOperator.globalStatistics(); - // global statistics is always restored and used initially even if - // downstream parallelism changed. - assertThat(globalStatistics).isNotNull(); - // request is always sent to coordinator during initialization. - // coordinator would respond with a new global statistics that - // has range bound recomputed with new parallelism. 
- verify(spyGateway).sendEventToCoordinator(any(RequestGlobalStatisticsEvent.class)); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); - assertThat(globalStatistics.rangeBounds()).isNull(); - } else { - assertThat(globalStatistics.mapAssignment()).isNull(); - assertThat(globalStatistics.rangeBounds()).isEqualTo(rangeBounds); - } - } - } - - @SuppressWarnings("unchecked") - @Test - public void testMigrationWithLocalStatsOverThreshold() throws Exception { - DataStatisticsOperator operator = createOperator(StatisticsType.Auto, Fixtures.NUM_SUBTASKS); - try (OneInputStreamOperatorTestHarness testHarness = - createHarness(operator)) { - StateInitializationContext stateContext = getStateContext(); - operator.initializeState(stateContext); - - // add rows with unique keys - for (int i = 0; i < SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD; ++i) { - operator.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString(String.valueOf(i)), i))); - assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Map); - assertThat((Map) operator.localStatistics().result()).hasSize(i + 1); - } - - // one more item should trigger the migration to sketch stats - operator.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString("key-trigger-migration"), 1))); - - int reservoirSize = - SketchUtil.determineOperatorReservoirSize(Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS); - - assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); - ReservoirItemsSketch sketch = - (ReservoirItemsSketch) operator.localStatistics().result(); - assertThat(sketch.getK()).isEqualTo(reservoirSize); - assertThat(sketch.getN()).isEqualTo(SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD + 1); - // reservoir not full yet - assertThat(sketch.getN()).isLessThan(reservoirSize); - assertThat(sketch.getSamples()).hasSize((int) sketch.getN()); - - // add more items to saturate the reservoir - for (int i = 0; i < reservoirSize; ++i) { - operator.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString(String.valueOf(i)), i))); - } - - assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); - sketch = (ReservoirItemsSketch) operator.localStatistics().result(); - assertThat(sketch.getK()).isEqualTo(reservoirSize); - assertThat(sketch.getN()) - .isEqualTo(SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD + 1 + reservoirSize); - // reservoir is full now - assertThat(sketch.getN()).isGreaterThan(reservoirSize); - assertThat(sketch.getSamples()).hasSize(reservoirSize); - - testHarness.endInput(); - } - } - - @SuppressWarnings("unchecked") - @Test - public void testMigrationWithGlobalSketchStatistics() throws Exception { - DataStatisticsOperator operator = createOperator(StatisticsType.Auto, Fixtures.NUM_SUBTASKS); - try (OneInputStreamOperatorTestHarness testHarness = - createHarness(operator)) { - StateInitializationContext stateContext = getStateContext(); - operator.initializeState(stateContext); - - // started with Map stype - operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 1))); - assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Map); - assertThat((Map) operator.localStatistics().result()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); - - // received global statistics with sketch type - GlobalStatistics 
globalStatistics = - GlobalStatistics.fromRangeBounds( - 1L, new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("f")}); - operator.handleOperatorEvent( - StatisticsEvent.createGlobalStatisticsEvent( - globalStatistics, Fixtures.GLOBAL_STATISTICS_SERIALIZER, false)); - - int reservoirSize = - SketchUtil.determineOperatorReservoirSize(Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS); - - assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); - ReservoirItemsSketch sketch = - (ReservoirItemsSketch) operator.localStatistics().result(); - assertThat(sketch.getK()).isEqualTo(reservoirSize); - assertThat(sketch.getN()).isEqualTo(1); - assertThat(sketch.getSamples()).isEqualTo(new SortKey[] {CHAR_KEYS.get("a")}); - - testHarness.endInput(); - } - } - - private StateInitializationContext getStateContext() throws Exception { - AbstractStateBackend abstractStateBackend = new HashMapStateBackend(); - CloseableRegistry cancelStreamRegistry = new CloseableRegistry(); - OperatorStateStore operatorStateStore = - abstractStateBackend.createOperatorStateBackend( - new OperatorStateBackendParametersImpl( - env, "test-operator", Collections.emptyList(), cancelStreamRegistry)); - return new StateInitializationContextImpl(null, operatorStateStore, null, null, null); - } - - private OneInputStreamOperatorTestHarness createHarness( - DataStatisticsOperator dataStatisticsOperator) throws Exception { - OneInputStreamOperatorTestHarness harness = - new OneInputStreamOperatorTestHarness<>( - dataStatisticsOperator, Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS, 0); - harness.setup( - new StatisticsOrRecordSerializer( - Fixtures.GLOBAL_STATISTICS_SERIALIZER, Fixtures.ROW_SERIALIZER)); - harness.open(); - return harness; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java deleted file mode 100644 index 59ce6df05d9d..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
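
The two migration tests above lean on the counting behaviour of the DataSketches reservoir sampler: getN() grows with every item offered, while the retained sample is capped at the reservoir size k. A short standalone illustration of that behaviour, assuming the usual ReservoirItemsSketch.newInstance factory, is shown below.

    import org.apache.datasketches.sampling.ReservoirItemsSketch;

    final class ReservoirDemo {
      public static void main(String[] args) {
        int k = 4; // reservoir size; the operator derives its k via SketchUtil.determineOperatorReservoirSize
        ReservoirItemsSketch<String> sketch = ReservoirItemsSketch.newInstance(k);

        for (int i = 0; i < 10; i++) {
          sketch.update("key-" + i);
        }

        // All ten items are counted, but at most k of them are retained as samples.
        System.out.println(sketch.getN());              // 10
        System.out.println(sketch.getSamples().length); // 4
      }
    }
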
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; - -import org.apache.flink.api.common.typeutils.SerializerTestBase; -import org.apache.flink.api.common.typeutils.TypeSerializer; - -public class TestDataStatisticsSerializer extends SerializerTestBase { - @Override - protected TypeSerializer createSerializer() { - return Fixtures.TASK_STATISTICS_SERIALIZER; - } - - @Override - protected int getLength() { - return -1; - } - - @Override - protected Class getTypeClass() { - return DataStatistics.class; - } - - @Override - protected DataStatistics[] getTestData() { - return new DataStatistics[] { - new MapDataStatistics(), - Fixtures.createTaskStatistics( - StatisticsType.Map, CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")), - new SketchDataStatistics(128), - Fixtures.createTaskStatistics( - StatisticsType.Sketch, CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")) - }; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java deleted file mode 100644 index 7afaf239c668..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; - -import org.apache.flink.api.common.typeutils.SerializerTestBase; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; - -public class TestGlobalStatisticsSerializer extends SerializerTestBase { - - @Override - protected TypeSerializer createSerializer() { - return Fixtures.GLOBAL_STATISTICS_SERIALIZER; - } - - @Override - protected int getLength() { - return -1; - } - - @Override - protected Class getTypeClass() { - return GlobalStatistics.class; - } - - @Override - protected GlobalStatistics[] getTestData() { - return new GlobalStatistics[] { - GlobalStatistics.fromMapAssignment( - 1L, - MapAssignment.fromKeyFrequency( - Fixtures.NUM_SUBTASKS, - ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L), - 0.0d, - SORT_ORDER_COMPARTOR)), - GlobalStatistics.fromRangeBounds(2L, new SortKey[] {CHAR_KEYS.get("a"), CHAR_KEYS.get("b")}) - }; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java deleted file mode 100644 index 8a25c7ad9898..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_WRAPPER; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Map; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.Test; - -public class TestMapDataStatistics { - @SuppressWarnings("unchecked") - @Test - public void testAddsAndGet() { - MapDataStatistics dataStatistics = new MapDataStatistics(); - - GenericRowData reusedRow = GenericRowData.of(StringData.fromString("a"), 1); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("b")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("c")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("b")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("a")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("b")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - Map actual = (Map) dataStatistics.result(); - Map expected = - ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 3L, CHAR_KEYS.get("c"), 1L); - assertThat(actual).isEqualTo(expected); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java deleted file mode 100644 index a59ed3b1c77b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java +++ /dev/null @@ -1,436 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
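
TestMapDataStatistics above feeds the same mutable row and SortKey into the statistics repeatedly, so the expected result only holds if each key is copied before it is counted. Conceptually that is plain frequency counting with a defensive copy, roughly as sketched below; this is an assumption about the behaviour the test implies, not the Iceberg MapDataStatistics code.

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.iceberg.SortKey;

    final class KeyFrequency {
      private final Map<SortKey, Long> counts = new HashMap<>();

      // Copy the reused, mutable key before storing it; otherwise later wrap() calls
      // would rewrite every entry already in the map.
      void add(SortKey reusedKey) {
        counts.merge(reusedKey.copy(), 1L, Long::sum);
      }

      Map<SortKey, Long> result() {
        return counts;
      }
    }
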
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.util.Pair; -import org.junit.jupiter.api.Test; - -public class TestMapRangePartitioner { - private static final SortOrder SORT_ORDER = - SortOrder.builderFor(TestFixtures.SCHEMA).asc("data").build(); - - private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); - private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); - private static final SortKey[] SORT_KEYS = initSortKeys(); - - private static SortKey[] initSortKeys() { - SortKey[] sortKeys = new SortKey[10]; - for (int i = 0; i < 10; ++i) { - RowData rowData = - GenericRowData.of(StringData.fromString("k" + i), i, StringData.fromString("2023-06-20")); - RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); - keyWrapper.wrap(rowData); - SortKey sortKey = SORT_KEY.copy(); - sortKey.wrap(keyWrapper); - sortKeys[i] = sortKey; - } - return sortKeys; - } - - // Total weight is 800 - private final Map mapStatistics = - ImmutableMap.of( - SORT_KEYS[0], - 350L, - SORT_KEYS[1], - 230L, - SORT_KEYS[2], - 120L, - SORT_KEYS[3], - 40L, - SORT_KEYS[4], - 10L, - SORT_KEYS[5], - 10L, - SORT_KEYS[6], - 10L, - SORT_KEYS[7], - 10L, - SORT_KEYS[8], - 10L, - SORT_KEYS[9], - 10L); - - @Test - public void testEvenlyDividableNoClosingFileCost() { - int numPartitions = 8; - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); - - // each task should get targeted weight of 100 (=800/8) - Map expectedAssignment = - ImmutableMap.of( - SORT_KEYS[0], - new KeyAssignment( - ImmutableList.of(0, 1, 2, 3), ImmutableList.of(100L, 100L, 100L, 50L), 0L), - SORT_KEYS[1], - new KeyAssignment(ImmutableList.of(3, 4, 5), ImmutableList.of(50L, 100L, 80L), 0L), - SORT_KEYS[2], - new KeyAssignment(ImmutableList.of(5, 6), ImmutableList.of(20L, 100L), 0L), - SORT_KEYS[3], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(40L), 0L), - SORT_KEYS[4], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), - SORT_KEYS[5], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), - SORT_KEYS[6], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), - SORT_KEYS[7], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), - SORT_KEYS[8], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), - SORT_KEYS[9], - new KeyAssignment(ImmutableList.of(7), 
ImmutableList.of(10L), 0L)); - assertThat(mapAssignment).isEqualTo(new MapAssignment(numPartitions, expectedAssignment)); - - // key: subtask id - // value pair: first is the assigned weight, second is the number of assigned keys - Map> expectedAssignmentInfo = - ImmutableMap.of( - 0, - Pair.of(100L, 1), - 1, - Pair.of(100L, 1), - 2, - Pair.of(100L, 1), - 3, - Pair.of(100L, 2), - 4, - Pair.of(100L, 1), - 5, - Pair.of(100L, 2), - 6, - Pair.of(100L, 1), - 7, - Pair.of(100L, 7)); - assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); - - MapRangePartitioner partitioner = - new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); - Map>> partitionResults = - runPartitioner(partitioner, numPartitions, mapStatistics); - validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); - } - - @Test - public void testEvenlyDividableWithClosingFileCost() { - int numPartitions = 8; - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 5.0, SORT_ORDER_COMPARTOR); - - // target subtask weight is 100 before close file cost factored in. - // close file cost is 5 = 5% * 100. - // key weights before and after close file cost factored in - // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 - // close-cost: 20, 15, 10, 5, 5, 5, 5, 5, 5, 5 - // after: 370, 245, 130, 45, 15, 15, 15, 15, 15, 15 - // target subtask weight with close cost per subtask is 110 (880/8) - Map expectedAssignment = - ImmutableMap.of( - SORT_KEYS[0], - new KeyAssignment( - ImmutableList.of(0, 1, 2, 3), ImmutableList.of(110L, 110L, 110L, 40L), 5L), - SORT_KEYS[1], - new KeyAssignment(ImmutableList.of(3, 4, 5), ImmutableList.of(70L, 110L, 65L), 5L), - SORT_KEYS[2], - new KeyAssignment(ImmutableList.of(5, 6), ImmutableList.of(45L, 85L), 5L), - SORT_KEYS[3], - new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(25L, 20L), 5L), - SORT_KEYS[4], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), - SORT_KEYS[5], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), - SORT_KEYS[6], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), - SORT_KEYS[7], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), - SORT_KEYS[8], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), - SORT_KEYS[9], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L)); - assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); - - // key: subtask id - // value pair: first is the assigned weight (excluding close file cost) for the subtask, - // second is the number of keys assigned to the subtask - Map> expectedAssignmentInfo = - ImmutableMap.of( - 0, - Pair.of(105L, 1), - 1, - Pair.of(105L, 1), - 2, - Pair.of(105L, 1), - 3, - Pair.of(100L, 2), - 4, - Pair.of(105L, 1), - 5, - Pair.of(100L, 2), - 6, - Pair.of(100L, 2), - 7, - Pair.of(75L, 7)); - assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); - - MapRangePartitioner partitioner = - new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); - Map>> partitionResults = - runPartitioner(partitioner, numPartitions, mapStatistics); - validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); - } - - @Test - public void testNonDividableNoClosingFileCost() { - int numPartitions = 9; - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); - - // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 - // each 
task should get targeted weight of 89 = ceiling(800/9) - Map expectedAssignment = - ImmutableMap.of( - SORT_KEYS[0], - new KeyAssignment( - ImmutableList.of(0, 1, 2, 3), ImmutableList.of(89L, 89L, 89L, 83L), 0L), - SORT_KEYS[1], - new KeyAssignment( - ImmutableList.of(3, 4, 5, 6), ImmutableList.of(6L, 89L, 89L, 46L), 0L), - SORT_KEYS[2], - new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(43L, 77L), 0L), - SORT_KEYS[3], - new KeyAssignment(ImmutableList.of(7, 8), ImmutableList.of(12L, 28L), 0L), - SORT_KEYS[4], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), - SORT_KEYS[5], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), - SORT_KEYS[6], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), - SORT_KEYS[7], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), - SORT_KEYS[8], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), - SORT_KEYS[9], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L)); - assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); - - // key: subtask id - // value pair: first is the assigned weight, second is the number of assigned keys - Map> expectedAssignmentInfo = - ImmutableMap.of( - 0, - Pair.of(89L, 1), - 1, - Pair.of(89L, 1), - 2, - Pair.of(89L, 1), - 3, - Pair.of(89L, 2), - 4, - Pair.of(89L, 1), - 5, - Pair.of(89L, 1), - 6, - Pair.of(89L, 2), - 7, - Pair.of(89L, 2), - 8, - Pair.of(88L, 7)); - assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); - - MapRangePartitioner partitioner = - new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); - Map>> partitionResults = - runPartitioner(partitioner, numPartitions, mapStatistics); - validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); - } - - @Test - public void testNonDividableWithClosingFileCost() { - int numPartitions = 9; - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 5.0, SORT_ORDER_COMPARTOR); - - // target subtask weight is 89 before close file cost factored in. - // close file cost is 5 (= 5% * 89) per file. 
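// [Editor's sketch, not part of the original patch] The expected numbers in these comments
// can be reproduced with plain arithmetic. The snippet below is illustrative only and assumes
// the close-file cost is 5% of the per-subtask target weight, rounded up, with a key paying
// that cost once per subtask it spans; see MapAssignment for the authoritative logic.
long totalWeight = 800L;                                          // sum of mapStatistics above
int numPartitions = 9;
long target = (totalWeight + numPartitions - 1) / numPartitions;  // ceiling(800/9) = 89
long closeCost = (long) Math.ceil(target * 5.0 / 100.0);          // ceiling(4.45) = 5
long filesForKey0 = (350L + target - 1) / target;                 // the 350-weight key spans 4 subtasks
long adjustedKey0 = 350L + filesForKey0 * closeCost;              // 370, matching the "after" row below
long adjustedTarget = (880L + numPartitions - 1) / numPartitions; // ceiling(880/9) = 98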
- // key weights before and after close file cost factored in - // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 - // close-cost: 20, 15, 10, 5, 5, 5, 5, 5, 5, 5 - // after: 370, 245, 130, 45, 15, 15, 15, 15, 15, 15 - // target subtask weight per subtask is 98 ceiling(880/9) - Map expectedAssignment = - ImmutableMap.of( - SORT_KEYS[0], - new KeyAssignment( - ImmutableList.of(0, 1, 2, 3), ImmutableList.of(98L, 98L, 98L, 76L), 5L), - SORT_KEYS[1], - new KeyAssignment( - ImmutableList.of(3, 4, 5, 6), ImmutableList.of(22L, 98L, 98L, 27L), 5L), - SORT_KEYS[2], - new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(71L, 59L), 5L), - SORT_KEYS[3], - new KeyAssignment(ImmutableList.of(7, 8), ImmutableList.of(39L, 6L), 5L), - SORT_KEYS[4], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), - SORT_KEYS[5], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), - SORT_KEYS[6], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), - SORT_KEYS[7], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), - SORT_KEYS[8], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), - SORT_KEYS[9], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L)); - assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); - - // key: subtask id - // value pair: first is the assigned weight for the subtask, second is the number of keys - // assigned to the subtask - Map> expectedAssignmentInfo = - ImmutableMap.of( - 0, - Pair.of(93L, 1), - 1, - Pair.of(93L, 1), - 2, - Pair.of(93L, 1), - 3, - Pair.of(88L, 2), - 4, - Pair.of(93L, 1), - 5, - Pair.of(93L, 1), - 6, - Pair.of(88L, 2), - 7, - Pair.of(88L, 2), - 8, - Pair.of(61L, 7)); - assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); - - MapRangePartitioner partitioner = - new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); - Map>> partitionResults = - runPartitioner(partitioner, numPartitions, mapStatistics); - // drift threshold is high for non-dividable scenario with close cost - validatePartitionResults(expectedAssignmentInfo, partitionResults, 10.0); - } - - private static Map>> runPartitioner( - MapRangePartitioner partitioner, int numPartitions, Map mapStatistics) { - // The Map key is the subtaskId. - // For the map value pair, the first element is the count of assigned and - // the second element of Set is for the set of assigned keys. 
- Map>> partitionResults = Maps.newHashMap(); - mapStatistics.forEach( - (sortKey, weight) -> { - String key = sortKey.get(0, String.class); - // run 100x times of the weight - long iterations = weight * 100; - for (int i = 0; i < iterations; ++i) { - RowData rowData = - GenericRowData.of( - StringData.fromString(key), 1, StringData.fromString("2023-06-20")); - int subtaskId = partitioner.partition(rowData, numPartitions); - partitionResults.computeIfAbsent( - subtaskId, k -> Pair.of(new AtomicLong(0), Sets.newHashSet())); - Pair> pair = partitionResults.get(subtaskId); - pair.first().incrementAndGet(); - pair.second().add(rowData); - } - }); - return partitionResults; - } - - /** - * @param expectedAssignmentInfo excluding closing cost - */ - private void validatePartitionResults( - Map> expectedAssignmentInfo, - Map>> partitionResults, - double maxDriftPercentage) { - - assertThat(partitionResults).hasSameSizeAs(expectedAssignmentInfo); - - List expectedAssignedKeyCounts = - Lists.newArrayListWithExpectedSize(expectedAssignmentInfo.size()); - List actualAssignedKeyCounts = - Lists.newArrayListWithExpectedSize(partitionResults.size()); - List expectedNormalizedWeights = - Lists.newArrayListWithExpectedSize(expectedAssignmentInfo.size()); - List actualNormalizedWeights = - Lists.newArrayListWithExpectedSize(partitionResults.size()); - - long expectedTotalWeight = - expectedAssignmentInfo.values().stream().mapToLong(Pair::first).sum(); - expectedAssignmentInfo.forEach( - (subtaskId, pair) -> { - expectedAssignedKeyCounts.add(pair.second()); - expectedNormalizedWeights.add(pair.first().doubleValue() / expectedTotalWeight); - }); - - long actualTotalWeight = - partitionResults.values().stream().mapToLong(pair -> pair.first().longValue()).sum(); - partitionResults.forEach( - (subtaskId, pair) -> { - actualAssignedKeyCounts.add(pair.second().size()); - actualNormalizedWeights.add(pair.first().doubleValue() / actualTotalWeight); - }); - - // number of assigned keys should match exactly - assertThat(actualAssignedKeyCounts) - .as("the number of assigned keys should match for every subtask") - .isEqualTo(expectedAssignedKeyCounts); - - // weight for every subtask shouldn't differ for more than some threshold relative to the - // expected weight - for (int subtaskId = 0; subtaskId < expectedNormalizedWeights.size(); ++subtaskId) { - double expectedWeight = expectedNormalizedWeights.get(subtaskId); - double min = expectedWeight * (1 - maxDriftPercentage / 100); - double max = expectedWeight * (1 + maxDriftPercentage / 100); - assertThat(actualNormalizedWeights.get(subtaskId)) - .as( - "Subtask %d weight should within %.1f percent of the expected range %s", - subtaskId, maxDriftPercentage, expectedWeight) - .isBetween(min, max); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java deleted file mode 100644 index 0485fdb7fa04..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SCHEMA; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Set; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.junit.jupiter.api.Test; - -public class TestRangePartitioner { - private final int numPartitions = 4; - - @Test - public void testRoundRobinRecordsBeforeStatisticsAvailable() { - RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); - Set results = Sets.newHashSetWithExpectedSize(numPartitions); - for (int i = 0; i < numPartitions; ++i) { - results.add( - partitioner.partition( - StatisticsOrRecord.fromRecord(GenericRowData.of(StringData.fromString("a"), 1)), - numPartitions)); - } - - // round-robin. every partition should get an assignment - assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); - } - - @Test - public void testRoundRobinStatisticsWrapper() { - RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); - Set results = Sets.newHashSetWithExpectedSize(numPartitions); - for (int i = 0; i < numPartitions; ++i) { - GlobalStatistics statistics = - GlobalStatistics.fromRangeBounds(1L, new SortKey[] {CHAR_KEYS.get("a")}); - results.add( - partitioner.partition(StatisticsOrRecord.fromStatistics(statistics), numPartitions)); - } - - // round-robin. every partition should get an assignment - assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java deleted file mode 100644 index d6d8aebc6350..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitionerSkew.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static java.lang.String.format; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Arrays; -import java.util.Comparator; -import java.util.DoubleSummaryStatistics; -import java.util.IntSummaryStatistics; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import java.util.UUID; -import java.util.concurrent.ThreadLocalRandom; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderComparators; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.CsvSource; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class TestRangePartitionerSkew { - private static final Logger LOG = LoggerFactory.getLogger(TestRangePartitionerSkew.class); - - // change the iterations to a larger number (like 100) to see the statistics of max skew. - // like min, max, avg, stddev of max skew. - private static final int ITERATIONS = 1; - - /** - * @param parallelism number of partitions - * @param maxSkewUpperBound the upper bound of max skew. maxSkewUpperBound is set to a loose bound - * (~5x of the max value) to avoid flakiness. - *
    - *
  • Map parallelism 8: max skew statistics over 100 iterations: mean = 0.0124, min = - * 0.0046, max = 0.0213 - *
  • Map parallelism 32: max skew statistics over 100 iterations: mean = 0.0183, min = - * 0.0100, max = 0.0261 - */ - @ParameterizedTest - @CsvSource({"8, 100_000, 0.1", "32, 400_000, 0.15"}) - public void testMapStatisticsSkewWithLongTailDistribution( - int parallelism, int sampleSize, double maxSkewUpperBound) { - Schema schema = - new Schema(Types.NestedField.optional(1, "event_hour", Types.IntegerType.get())); - SortOrder sortOrder = SortOrder.builderFor(schema).asc("event_hour").build(); - Comparator comparator = SortOrderComparators.forSchema(schema, sortOrder); - SortKey sortKey = new SortKey(schema, sortOrder); - - NavigableMap weights = - DataDistributionUtil.longTailDistribution(100_000, 24, 240, 100, 2.0, 0.7); - Map mapStatistics = - DataDistributionUtil.mapStatisticsWithLongTailDistribution(weights, sortKey); - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(parallelism, mapStatistics, 0.0, comparator); - MapRangePartitioner partitioner = new MapRangePartitioner(schema, sortOrder, mapAssignment); - - List keys = Lists.newArrayList(weights.keySet().iterator()); - long[] weightsCDF = DataDistributionUtil.computeCumulativeWeights(keys, weights); - long totalWeight = weightsCDF[weightsCDF.length - 1]; - - // change the iterations to a larger number (like 100) to see the statistics of max skew. - // like min, max, avg, stddev of max skew. - double[] maxSkews = new double[ITERATIONS]; - for (int iteration = 0; iteration < ITERATIONS; ++iteration) { - int[] recordsPerTask = new int[parallelism]; - for (int i = 0; i < sampleSize; ++i) { - // randomly pick a key according to the weight distribution - long weight = ThreadLocalRandom.current().nextLong(totalWeight); - int index = DataDistributionUtil.binarySearchIndex(weightsCDF, weight); - RowData row = GenericRowData.of(keys.get(index)); - int subtaskId = partitioner.partition(row, parallelism); - recordsPerTask[subtaskId] += 1; - } - - IntSummaryStatistics recordsPerTaskStats = Arrays.stream(recordsPerTask).summaryStatistics(); - LOG.debug("Map parallelism {}: records per task stats: {}", parallelism, recordsPerTaskStats); - double maxSkew = - (recordsPerTaskStats.getMax() - recordsPerTaskStats.getAverage()) - / recordsPerTaskStats.getAverage(); - LOG.debug("Map parallelism {}: max skew: {}", parallelism, format("%.03f", maxSkew)); - assertThat(maxSkew).isLessThan(maxSkewUpperBound); - maxSkews[iteration] = maxSkew; - } - - DoubleSummaryStatistics maxSkewStats = Arrays.stream(maxSkews).summaryStatistics(); - LOG.info( - "Map parallelism {}: max skew statistics over {} iterations: mean = {}, min = {}, max = {}", - parallelism, - ITERATIONS, - format("%.4f", maxSkewStats.getAverage()), - format("%.4f", maxSkewStats.getMin()), - format("%.4f", maxSkewStats.getMax())); - } - - /** - * @param parallelism number of partitions - * @param maxSkewUpperBound the upper bound of max skew. maxSkewUpperBound is set to a loose bound - * (~5x of the max value) to avoid flakiness. - *
    - *
  • Map parallelism 8: max skew statistics over 100 iterations: mean = 0.0192, min = - * 0.0073, max = 0.0437 - *
  • Map parallelism 32: max skew statistics over 100 iterations: mean = 0.0426, min = - * 0.0262, max = 0.0613 - */ - @ParameterizedTest - @CsvSource({"8, 100_000, 0.20", "32, 400_000, 0.25"}) - public void testSketchStatisticsSkewWithLongTailDistribution( - int parallelism, int sampleSize, double maxSkewUpperBound) { - Schema schema = new Schema(Types.NestedField.optional(1, "uuid", Types.UUIDType.get())); - SortOrder sortOrder = SortOrder.builderFor(schema).asc("uuid").build(); - SortKey sortKey = new SortKey(schema, sortOrder); - - UUID[] reservoir = DataDistributionUtil.reservoirSampleUUIDs(1_000_000, 100_000); - UUID[] rangeBound = DataDistributionUtil.rangeBoundSampleUUIDs(reservoir, parallelism); - SortKey[] rangeBoundSortKeys = - Arrays.stream(rangeBound) - .map( - uuid -> { - SortKey sortKeyCopy = sortKey.copy(); - sortKeyCopy.set(0, uuid); - return sortKeyCopy; - }) - .toArray(SortKey[]::new); - - SketchRangePartitioner partitioner = - new SketchRangePartitioner(schema, sortOrder, rangeBoundSortKeys); - - double[] maxSkews = new double[ITERATIONS]; - for (int iteration = 0; iteration < ITERATIONS; ++iteration) { - int[] recordsPerTask = new int[parallelism]; - for (int i = 0; i < sampleSize; ++i) { - UUID uuid = UUID.randomUUID(); - Object uuidBytes = DataDistributionUtil.uuidBytes(uuid); - RowData row = GenericRowData.of(uuidBytes); - int subtaskId = partitioner.partition(row, parallelism); - recordsPerTask[subtaskId] += 1; - } - - IntSummaryStatistics recordsPerTaskStats = Arrays.stream(recordsPerTask).summaryStatistics(); - LOG.debug("Map parallelism {}: records per task stats: {}", parallelism, recordsPerTaskStats); - double maxSkew = - (recordsPerTaskStats.getMax() - recordsPerTaskStats.getAverage()) - / recordsPerTaskStats.getAverage(); - LOG.debug("Map parallelism {}: max skew: {}", parallelism, format("%.03f", maxSkew)); - assertThat(maxSkew).isLessThan(maxSkewUpperBound); - maxSkews[iteration] = maxSkew; - } - - DoubleSummaryStatistics maxSkewStats = Arrays.stream(maxSkews).summaryStatistics(); - LOG.info( - "Map parallelism {}: max skew statistics over {} iterations: mean = {}, min = {}, max = {}", - parallelism, - ITERATIONS, - format("%.4f", maxSkewStats.getAverage()), - format("%.4f", maxSkewStats.getMin()), - format("%.4f", maxSkewStats.getMax())); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java deleted file mode 100644 index 396bfae2f13c..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_WRAPPER; -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.SortKey; -import org.junit.jupiter.api.Test; - -public class TestSketchDataStatistics { - @SuppressWarnings("unchecked") - @Test - public void testAddsAndGet() { - SketchDataStatistics dataStatistics = new SketchDataStatistics(128); - - GenericRowData reusedRow = GenericRowData.of(StringData.fromString("a"), 1); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("b")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("c")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("b")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - ReservoirItemsSketch actual = (ReservoirItemsSketch) dataStatistics.result(); - assertThat(actual.getSamples()) - .isEqualTo( - new SortKey[] { - CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("b") - }); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java deleted file mode 100644 index 378c6afff077..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.TestFixtures; -import org.junit.jupiter.api.Test; - -public class TestSketchRangePartitioner { - // sort on the long id field - private static final SortOrder SORT_ORDER = - SortOrder.builderFor(TestFixtures.SCHEMA).asc("id").build(); - private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); - private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); - private static final int NUM_PARTITIONS = 16; - private static final long RANGE_STEP = 1_000; - private static final long MAX_ID = RANGE_STEP * NUM_PARTITIONS; - private static final SortKey[] RANGE_BOUNDS = createRangeBounds(); - - /** - * To understand how range bounds are used in range partitioning, here is an example for human - * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be - * - *
      - *
    • age <= 15 - *
    • age > 15 && age <= 32 - *
    • age > 32 && age <= 60 - *
    • age > 60 - *
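// [Editor's sketch, not part of the original patch] The bucketing described in the list above
// boils down to a binary search over the sorted bounds. A plain-int illustration with
// hypothetical values, not the SketchRangePartitioner internals:
int[] bounds = {15, 32, 60};
int pos = java.util.Arrays.binarySearch(bounds, 33);
int partition = pos >= 0 ? pos : -pos - 1;  // age 15 -> 0, age 33 -> 2, age 61 -> 3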
    - */ - private static SortKey[] createRangeBounds() { - SortKey[] rangeBounds = new SortKey[NUM_PARTITIONS - 1]; - for (int i = 0; i < NUM_PARTITIONS - 1; ++i) { - RowData rowData = - GenericRowData.of( - StringData.fromString("data"), - RANGE_STEP * (i + 1), - StringData.fromString("2023-06-20")); - RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); - keyWrapper.wrap(rowData); - SortKey sortKey = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); - sortKey.wrap(keyWrapper); - rangeBounds[i] = sortKey; - } - - return rangeBounds; - } - - @Test - public void testRangePartitioningWithRangeBounds() { - SketchRangePartitioner partitioner = - new SketchRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, RANGE_BOUNDS); - GenericRowData row = - GenericRowData.of(StringData.fromString("data"), 0L, StringData.fromString("2023-06-20")); - for (long id = 0; id < MAX_ID; ++id) { - row.setField(1, id); - int partition = partitioner.partition(row, NUM_PARTITIONS); - assertThat(partition).isGreaterThanOrEqualTo(0).isLessThan(NUM_PARTITIONS); - int expectedPartition = id == 0L ? 0 : (int) ((id - 1) / RANGE_STEP); - assertThat(partition).isEqualTo(expectedPartition); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java deleted file mode 100644 index a0f660a965ef..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.iceberg.SortKey; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -public class TestSketchUtil { - @Test - public void testCoordinatorReservoirSize() { - // adjusted to over min threshold of 10_000 and is divisible by number of partitions (3) - assertThat(SketchUtil.determineCoordinatorReservoirSize(3)).isEqualTo(10_002); - // adjust to multiplier of 100 - assertThat(SketchUtil.determineCoordinatorReservoirSize(123)).isEqualTo(123_00); - // adjusted to below max threshold of 1_000_000 and is divisible by number of partitions (3) - assertThat(SketchUtil.determineCoordinatorReservoirSize(10_123)) - .isEqualTo(1_000_000 - (1_000_000 % 10_123)); - } - - @Test - public void testOperatorReservoirSize() { - assertThat(SketchUtil.determineOperatorReservoirSize(5, 3)) - .isEqualTo((10_002 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 5); - assertThat(SketchUtil.determineOperatorReservoirSize(123, 123)) - .isEqualTo((123_00 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 123); - assertThat(SketchUtil.determineOperatorReservoirSize(256, 123)) - .isEqualTo( - (int) Math.ceil((double) (123_00 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 256)); - assertThat(SketchUtil.determineOperatorReservoirSize(5_120, 10_123)) - .isEqualTo( - (int) Math.ceil((double) (992_054 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 5_120)); - } - - @Test - public void testRangeBoundsOneChannel() { - assertThat( - SketchUtil.rangeBounds( - 1, - SORT_ORDER_COMPARTOR, - new SortKey[] { - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d"), - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f") - })) - .isEmpty(); - } - - @Test - public void testRangeBoundsDivisible() { - assertThat( - SketchUtil.rangeBounds( - 3, - SORT_ORDER_COMPARTOR, - new SortKey[] { - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d"), - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f") - })) - .containsExactly(CHAR_KEYS.get("b"), CHAR_KEYS.get("d")); - } - - @Test - public void testRangeBoundsNonDivisible() { - // step is 3 = ceiling(11/4) - assertThat( - SketchUtil.rangeBounds( - 4, - SORT_ORDER_COMPARTOR, - new SortKey[] { - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d"), - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f"), - CHAR_KEYS.get("g"), - CHAR_KEYS.get("h"), - CHAR_KEYS.get("i"), - CHAR_KEYS.get("j"), - CHAR_KEYS.get("k"), - })) - .containsExactly(CHAR_KEYS.get("c"), CHAR_KEYS.get("f"), CHAR_KEYS.get("i")); - } - - @Test - public void testRangeBoundsSkipDuplicates() { - // step is 3 = ceiling(11/4) - assertThat( - SketchUtil.rangeBounds( - 4, - SORT_ORDER_COMPARTOR, - new SortKey[] { - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("g"), - CHAR_KEYS.get("h"), - CHAR_KEYS.get("i"), - CHAR_KEYS.get("j"), - CHAR_KEYS.get("k"), - })) - // skipped duplicate c's - .containsExactly(CHAR_KEYS.get("c"), CHAR_KEYS.get("g"), CHAR_KEYS.get("j")); - } - - @Test - public void testRangeBoundsNumPartitionsBiggerThanSortKeyCount() { - assertThat( - SketchUtil.rangeBounds( - 5, - SORT_ORDER_COMPARTOR, - new SortKey[] 
{CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c")})) - .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c")) - .doesNotContainNull(); - } - - @ParameterizedTest - @ValueSource(ints = {4, 6}) - public void testPartitioningAndScaleUp(int numPartitions) { - // Range bounds are calculated based on 4 partitions - SortKey[] rangeBounds = - new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; - - // <= c - assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); - assertPartition(0, CHAR_KEYS.get("c"), numPartitions, rangeBounds); - // > c && <= j - assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); - assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); - assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); - // > j && <= m - assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); - assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); - assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); - // > m - assertPartition(3, CHAR_KEYS.get("n"), numPartitions, rangeBounds); - assertPartition(3, CHAR_KEYS.get("z"), numPartitions, rangeBounds); - } - - @Test - public void testPartitionScaleDown() { - // Range bounds are calculated based on 4 partitions - SortKey[] rangeBounds = - new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; - int numPartitions = 3; - - // <= c - assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); - assertPartition(0, CHAR_KEYS.get("c"), numPartitions, rangeBounds); - // > c && <= j - assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); - assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); - assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); - // > j && <= m - assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); - assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); - assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); - // > m - // reassigns out-of-range partitions via mod (% 3 in this case) - assertPartition(0, CHAR_KEYS.get("n"), numPartitions, rangeBounds); - assertPartition(0, CHAR_KEYS.get("z"), numPartitions, rangeBounds); - } - - private static void assertPartition( - int expectedPartition, SortKey key, int numPartitions, SortKey[] rangeBounds) { - assertThat(SketchUtil.partition(key, numPartitions, rangeBounds, SORT_ORDER_COMPARTOR)) - .isEqualTo(expectedPartition); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java deleted file mode 100644 index c7fea015142c..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
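// [Editor's sketch, not part of the original patch, referring to the scale-up/scale-down tests
// above] Range bounds computed for 4 partitions map a record to an index in 0..3; per the
// test's own comment, an index at or beyond the current parallelism is wrapped with a modulo.
// Hypothetical illustration, not the exact SketchUtil.partition code:
int rangeIndex = 3;                         // key "n" falls past the last bound "m"
int numPartitions = 3;                      // parallelism scaled down from 4 to 3
int partition = rangeIndex % numPartitions; // 0, matching assertPartition(0, "n", 3, rangeBounds)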
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.api.common.typeutils.SerializerTestBase; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.GenericRowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; - -public abstract class TestSortKeySerializerBase extends SerializerTestBase { - - protected abstract Schema schema(); - - protected abstract SortOrder sortOrder(); - - protected abstract GenericRowData rowData(); - - @Override - protected TypeSerializer createSerializer() { - return new SortKeySerializer(schema(), sortOrder()); - } - - @Override - protected int getLength() { - return -1; - } - - @Override - protected Class getTypeClass() { - return SortKey.class; - } - - @Override - protected SortKey[] getTestData() { - return new SortKey[] {sortKey()}; - } - - private SortKey sortKey() { - RowDataWrapper rowDataWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(schema()), schema().asStruct()); - SortKey sortKey = new SortKey(schema(), sortOrder()); - sortKey.wrap(rowDataWrapper.wrap(rowData())); - return sortKey; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java deleted file mode 100644 index 0000688a8b55..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.table.data.GenericRowData; -import org.apache.iceberg.NullOrder; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortDirection; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.DataGenerator; -import org.apache.iceberg.flink.DataGenerators; - -public class TestSortKeySerializerNestedStruct extends TestSortKeySerializerBase { - private final DataGenerator generator = new DataGenerators.StructOfStruct(); - - @Override - protected Schema schema() { - return generator.icebergSchema(); - } - - @Override - protected SortOrder sortOrder() { - return SortOrder.builderFor(schema()) - .asc("row_id") - .sortBy( - Expressions.bucket("struct_of_struct.id", 4), SortDirection.DESC, NullOrder.NULLS_LAST) - .sortBy( - Expressions.truncate("struct_of_struct.person_struct.name", 16), - SortDirection.ASC, - NullOrder.NULLS_FIRST) - .build(); - } - - @Override - protected GenericRowData rowData() { - return generator.generateFlinkRowData(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java deleted file mode 100644 index ac2e2784e681..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.assertj.core.api.AssertionsForClassTypes.assertThat; - -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.NullOrder; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortDirection; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.DataGenerator; -import org.apache.iceberg.flink.DataGenerators; -import org.apache.iceberg.flink.RowDataWrapper; -import org.junit.jupiter.api.Test; - -public class TestSortKeySerializerPrimitives extends TestSortKeySerializerBase { - private final DataGenerator generator = new DataGenerators.Primitives(); - - @Override - protected Schema schema() { - return generator.icebergSchema(); - } - - @Override - protected SortOrder sortOrder() { - return SortOrder.builderFor(schema()) - .asc("boolean_field") - .sortBy(Expressions.bucket("int_field", 4), SortDirection.DESC, NullOrder.NULLS_LAST) - .sortBy(Expressions.truncate("string_field", 2), SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy(Expressions.bucket("uuid_field", 16), SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy(Expressions.hour("ts_with_zone_field"), SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy(Expressions.day("ts_without_zone_field"), SortDirection.ASC, NullOrder.NULLS_FIRST) - // can not test HeapByteBuffer due to equality test inside SerializerTestBase - // .sortBy(Expressions.truncate("binary_field", 2), SortDirection.ASC, - // NullOrder.NULLS_FIRST) - .build(); - } - - @Override - protected GenericRowData rowData() { - return generator.generateFlinkRowData(); - } - - @Test - public void testSerializationSize() throws Exception { - RowData rowData = - GenericRowData.of(StringData.fromString("550e8400-e29b-41d4-a716-446655440000"), 1L); - RowDataWrapper rowDataWrapper = - new RowDataWrapper(Fixtures.ROW_TYPE, Fixtures.SCHEMA.asStruct()); - StructLike struct = rowDataWrapper.wrap(rowData); - SortKey sortKey = Fixtures.SORT_KEY.copy(); - sortKey.wrap(struct); - SortKeySerializer serializer = new SortKeySerializer(Fixtures.SCHEMA, Fixtures.SORT_ORDER); - DataOutputSerializer output = new DataOutputSerializer(1024); - serializer.serialize(sortKey, output); - byte[] serializedBytes = output.getCopyOfBuffer(); - assertThat(serializedBytes.length) - .as( - "Serialized bytes for sort key should be 39 bytes (34 UUID text + 4 byte integer of string length + 1 byte of isnull flag") - .isEqualTo(39); - - DataInputDeserializer input = new DataInputDeserializer(serializedBytes); - SortKey deserialized = serializer.deserialize(input); - assertThat(deserialized).isEqualTo(sortKey); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java deleted file mode 100644 index 2d87b089cecb..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_TYPE; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SCHEMA; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_KEY; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER; -import static org.assertj.core.api.AssertionsForClassTypes.assertThat; - -import java.io.IOException; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestSortKeySerializerSnapshot { - private final Schema schema = - new Schema( - Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(2, "str", Types.StringType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get()), - Types.NestedField.optional(4, "boolean", Types.BooleanType.get())); - private final SortOrder sortOrder = SortOrder.builderFor(schema).asc("str").asc("int").build(); - - @Test - public void testRestoredSerializer() throws Exception { - RowData rowData = GenericRowData.of(StringData.fromString("str"), 1); - RowDataWrapper rowDataWrapper = new RowDataWrapper(ROW_TYPE, SCHEMA.asStruct()); - StructLike struct = rowDataWrapper.wrap(rowData); - SortKey sortKey = SORT_KEY.copy(); - sortKey.wrap(struct); - - SortKeySerializer originalSerializer = new SortKeySerializer(SCHEMA, SORT_ORDER); - TypeSerializerSnapshot snapshot = - roundTrip(originalSerializer.snapshotConfiguration()); - TypeSerializer restoredSerializer = snapshot.restoreSerializer(); - - DataOutputSerializer output = new DataOutputSerializer(1024); - originalSerializer.serialize(sortKey, output); - byte[] serializedBytes = output.getCopyOfBuffer(); - - DataInputDeserializer input = new DataInputDeserializer(serializedBytes); - SortKey deserialized = restoredSerializer.deserialize(input); - assertThat(deserialized).isEqualTo(sortKey); - } - - @Test - public void testRestoredOldSerializer() throws Exception { - RowData rowData = GenericRowData.of(StringData.fromString("str"), 1); - RowDataWrapper rowDataWrapper = new 
RowDataWrapper(ROW_TYPE, SCHEMA.asStruct()); - StructLike struct = rowDataWrapper.wrap(rowData); - SortKey sortKey = SORT_KEY.copy(); - sortKey.wrap(struct); - - SortKeySerializer originalSerializer = new SortKeySerializer(SCHEMA, SORT_ORDER, 1); - TypeSerializerSnapshot snapshot = - roundTrip(originalSerializer.snapshotConfiguration()); - TypeSerializer restoredSerializer = snapshot.restoreSerializer(); - ((SortKeySerializer) restoredSerializer).setVersion(1); - DataOutputSerializer output = new DataOutputSerializer(1024); - originalSerializer.serialize(sortKey, output); - byte[] serializedBytes = output.getCopyOfBuffer(); - - DataInputDeserializer input = new DataInputDeserializer(serializedBytes); - SortKey deserialized = restoredSerializer.deserialize(input); - assertThat(deserialized).isEqualTo(sortKey); - } - - @Test - public void testSnapshotIsCompatibleWithSameSortOrder() throws Exception { - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); - - SortKeySerializer.SortKeySerializerSnapshot newSnapshot = - roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder)); - - TypeSerializerSchemaCompatibility resultCompatibility = - newSnapshot.resolveSchemaCompatibility(oldSnapshot); - assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); - } - - @Test - public void testSnapshotIsCompatibleWithRemoveNonSortField() throws Exception { - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); - - // removed non-sort boolean field - Schema newSchema = - new Schema( - Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(2, "str", Types.StringType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get())); - SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); - SortKeySerializer.SortKeySerializerSnapshot newSnapshot = - roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(newSchema, newSortOrder)); - - TypeSerializerSchemaCompatibility resultCompatibility = - newSnapshot.resolveSchemaCompatibility(oldSnapshot); - assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); - } - - @Test - public void testSnapshotIsCompatibleWithAddNonSortField() throws Exception { - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); - - // add a new non-sort float field - Schema newSchema = - new Schema( - Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(2, "str", Types.StringType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get()), - Types.NestedField.optional(4, "boolean", Types.BooleanType.get()), - Types.NestedField.required(5, "float", Types.FloatType.get())); - SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); - SortKeySerializer.SortKeySerializerSnapshot newSnapshot = - roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(newSchema, newSortOrder)); - - TypeSerializerSchemaCompatibility resultCompatibility = - newSnapshot.resolveSchemaCompatibility(oldSnapshot); - assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); - } - - @Test - public void testSnapshotIsIncompatibleWithIncompatibleSchema() throws Exception { - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - new SortKeySerializer.SortKeySerializerSnapshot(schema, 
sortOrder); - - // change str field to a long type - Schema newSchema = - new Schema( - Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(2, "str", Types.LongType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get()), - Types.NestedField.optional(4, "boolean", Types.BooleanType.get())); - SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); - // switch sort field order - SortKeySerializer.SortKeySerializerSnapshot newSnapshot = - roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(newSchema, newSortOrder)); - - TypeSerializerSchemaCompatibility resultCompatibility = - newSnapshot.resolveSchemaCompatibility(oldSnapshot); - assertThat(resultCompatibility.isIncompatible()).isTrue(); - } - - @Test - public void testSnapshotIsIncompatibleWithAddSortField() throws Exception { - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); - - // removed str field from sort order - SortOrder newSortOrder = - SortOrder.builderFor(schema).asc("str").asc("int").desc("boolean").build(); - SortKeySerializer.SortKeySerializerSnapshot newSnapshot = - roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, newSortOrder)); - - TypeSerializerSchemaCompatibility resultCompatibility = - newSnapshot.resolveSchemaCompatibility(oldSnapshot); - assertThat(resultCompatibility.isIncompatible()).isTrue(); - } - - @Test - public void testSnapshotIsIncompatibleWithRemoveSortField() throws Exception { - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); - - // remove str field from sort order - SortOrder newSortOrder = SortOrder.builderFor(schema).asc("int").build(); - SortKeySerializer.SortKeySerializerSnapshot newSnapshot = - roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, newSortOrder)); - - TypeSerializerSchemaCompatibility resultCompatibility = - newSnapshot.resolveSchemaCompatibility(oldSnapshot); - assertThat(resultCompatibility.isIncompatible()).isTrue(); - } - - @Test - public void testSnapshotIsIncompatibleWithSortFieldsOrderChange() throws Exception { - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); - - // switch sort field order - SortOrder newSortOrder = SortOrder.builderFor(schema).asc("int").asc("str").build(); - SortKeySerializer.SortKeySerializerSnapshot newSnapshot = - roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, newSortOrder)); - - TypeSerializerSchemaCompatibility resultCompatibility = - newSnapshot.resolveSchemaCompatibility(oldSnapshot); - assertThat(resultCompatibility.isIncompatible()).isTrue(); - } - - /** Copied from Flink {@code AvroSerializerSnapshotTest} */ - private static SortKeySerializer.SortKeySerializerSnapshot roundTrip( - TypeSerializerSnapshot original) throws IOException { - // writeSnapshot(); - DataOutputSerializer out = new DataOutputSerializer(1024); - original.writeSnapshot(out); - // init - SortKeySerializer.SortKeySerializerSnapshot restored = - new SortKeySerializer.SortKeySerializerSnapshot(); - // readSnapshot(); - DataInputView in = new DataInputDeserializer(out.wrapAsByteBuffer()); - restored.readSnapshot(restored.getCurrentVersion(), in, original.getClass().getClassLoader()); - return restored; - } -} diff --git 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java deleted file mode 100644 index 1be7e27f2c01..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.iceberg.NullOrder; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortDirection; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestSortKeyUtil { - @Test - public void testResultSchema() { - Schema schema = - new Schema( - Types.NestedField.required(1, "id", Types.StringType.get()), - Types.NestedField.required(2, "ratio", Types.DoubleType.get()), - Types.NestedField.optional( - 3, - "user", - Types.StructType.of( - Types.NestedField.required(11, "name", Types.StringType.get()), - Types.NestedField.required(12, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(13, "device_id", Types.UUIDType.get()), - Types.NestedField.optional( - 14, - "location", - Types.StructType.of( - Types.NestedField.required(101, "lat", Types.FloatType.get()), - Types.NestedField.required(102, "long", Types.FloatType.get()), - Types.NestedField.required(103, "blob", Types.BinaryType.get())))))); - - SortOrder sortOrder = - SortOrder.builderFor(schema) - .asc("ratio") - .sortBy(Expressions.hour("user.ts"), SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy( - Expressions.bucket("user.device_id", 16), SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy( - Expressions.truncate("user.location.blob", 16), - SortDirection.ASC, - NullOrder.NULLS_FIRST) - .build(); - - assertThat(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()) - .isEqualTo( - Types.StructType.of( - Types.NestedField.required(0, "ratio_0", Types.DoubleType.get()), - Types.NestedField.required(1, "ts_1", Types.IntegerType.get()), - Types.NestedField.optional(2, "device_id_2", Types.IntegerType.get()), - Types.NestedField.required(3, "blob_3", Types.BinaryType.get()))); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java deleted file mode 100644 index f54198522e99..000000000000 --- 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestStatisticsOrRecordTypeInformation.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.api.common.typeutils.TypeInformationTestBase; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.types.Types; - -public class TestStatisticsOrRecordTypeInformation - extends TypeInformationTestBase { - private static final Schema SCHEMA = - new Schema( - Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(2, "uuid", Types.UUIDType.get()), - Types.NestedField.optional(3, "data", Types.StringType.get())); - private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); - private static final SortOrder SORT_ORDER1 = SortOrder.builderFor(SCHEMA).asc("ts").build(); - private static final SortOrder SORT_ORDER2 = SortOrder.builderFor(SCHEMA).asc("data").build(); - - @Override - protected StatisticsOrRecordTypeInformation[] getTestData() { - return new StatisticsOrRecordTypeInformation[] { - new StatisticsOrRecordTypeInformation(ROW_TYPE, SCHEMA, SORT_ORDER1), - new StatisticsOrRecordTypeInformation(ROW_TYPE, SCHEMA, SORT_ORDER2), - }; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java deleted file mode 100644 index f97937dfef9c..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Supplier; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.ProviderContext; -import org.apache.flink.table.connector.source.DataStreamScanProvider; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.connector.source.ScanTableSource; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.table.factories.DynamicTableSourceFactory; -import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -public class BoundedTableFactory implements DynamicTableSourceFactory { - private static final AtomicInteger DATA_SET_ID = new AtomicInteger(0); - private static final Map>> DATA_SETS = Maps.newHashMap(); - - private static final ConfigOption DATA_ID = - ConfigOptions.key("data-id").stringType().noDefaultValue(); - - public static String registerDataSet(List> dataSet) { - String dataSetId = String.valueOf(DATA_SET_ID.incrementAndGet()); - DATA_SETS.put(dataSetId, dataSet); - return dataSetId; - } - - public static void clearDataSets() { - DATA_SETS.clear(); - } - - @Override - public DynamicTableSource createDynamicTableSource(Context context) { - ResolvedSchema resolvedSchema = - ResolvedSchema.of( - context.getCatalogTable().getResolvedSchema().getColumns().stream() - .filter(Column::isPhysical) - .collect(Collectors.toList())); - - Configuration configuration = Configuration.fromMap(context.getCatalogTable().getOptions()); - String dataId = configuration.get(DATA_ID); - Preconditions.checkArgument( - DATA_SETS.containsKey(dataId), "data-id %s does not found in registered data set.", dataId); - - return new BoundedTableSource(DATA_SETS.get(dataId), resolvedSchema); - } - - @Override - public String factoryIdentifier() { - return "BoundedSource"; - } - - @Override - public Set> requiredOptions() { - return ImmutableSet.of(); - } - - @Override - public Set> optionalOptions() { - return ImmutableSet.of(DATA_ID); - } - - private static class BoundedTableSource implements ScanTableSource { - - private final List> elementsPerCheckpoint; - private final ResolvedSchema resolvedSchema; - - private BoundedTableSource( - List> 
elementsPerCheckpoint, ResolvedSchema resolvedSchema) { - this.elementsPerCheckpoint = elementsPerCheckpoint; - this.resolvedSchema = resolvedSchema; - } - - private BoundedTableSource(BoundedTableSource toCopy) { - this.elementsPerCheckpoint = toCopy.elementsPerCheckpoint; - this.resolvedSchema = toCopy.resolvedSchema; - } - - @Override - public ChangelogMode getChangelogMode() { - Supplier> supplier = () -> elementsPerCheckpoint.stream().flatMap(List::stream); - - // Add the INSERT row kind by default. - ChangelogMode.Builder builder = ChangelogMode.newBuilder().addContainedKind(RowKind.INSERT); - - if (supplier.get().anyMatch(r -> r.getKind() == RowKind.DELETE)) { - builder.addContainedKind(RowKind.DELETE); - } - - if (supplier.get().anyMatch(r -> r.getKind() == RowKind.UPDATE_BEFORE)) { - builder.addContainedKind(RowKind.UPDATE_BEFORE); - } - - if (supplier.get().anyMatch(r -> r.getKind() == RowKind.UPDATE_AFTER)) { - builder.addContainedKind(RowKind.UPDATE_AFTER); - } - - return builder.build(); - } - - @Override - public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { - return new DataStreamScanProvider() { - @Override - public DataStream produceDataStream( - ProviderContext providerContext, StreamExecutionEnvironment env) { - boolean checkpointEnabled = env.getCheckpointConfig().isCheckpointingEnabled(); - SourceFunction source = - new BoundedTestSource<>(elementsPerCheckpoint, checkpointEnabled); - - RowType rowType = (RowType) resolvedSchema.toSourceRowDataType().getLogicalType(); - // Converter to convert the Row to RowData. - DataFormatConverters.RowConverter rowConverter = - new DataFormatConverters.RowConverter( - resolvedSchema.getColumnDataTypes().toArray(DataType[]::new)); - - return env.addSource( - source, - new RowTypeInfo( - resolvedSchema.getColumnDataTypes().stream() - .map(ExternalTypeInfo::of) - .toArray(TypeInformation[]::new))) - .map(rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)); - } - - @Override - public boolean isBounded() { - return true; - } - }; - } - - @Override - public DynamicTableSource copy() { - return new BoundedTableSource(this); - } - - @Override - public String asSummaryString() { - return "Bounded test table source"; - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java deleted file mode 100644 index 7b435d059845..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.flink.api.common.state.CheckpointListener; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * A stream source that: 1) emits the elements from elementsPerCheckpoint.get(0) without allowing - * checkpoints. 2) then waits for the checkpoint to complete. 3) emits the elements from - * elementsPerCheckpoint.get(1) without allowing checkpoints. 4) then waits for the checkpoint to - * complete. 5) ... - * - *
    Util all the list from elementsPerCheckpoint are exhausted. - */ -public final class BoundedTestSource implements SourceFunction, CheckpointListener { - - private final List> elementsPerCheckpoint; - private final boolean checkpointEnabled; - private volatile boolean running = true; - - private final AtomicInteger numCheckpointsComplete = new AtomicInteger(0); - - /** Emits all those elements in several checkpoints. */ - public BoundedTestSource(List> elementsPerCheckpoint, boolean checkpointEnabled) { - this.elementsPerCheckpoint = elementsPerCheckpoint; - this.checkpointEnabled = checkpointEnabled; - } - - public BoundedTestSource(List> elementsPerCheckpoint) { - this(elementsPerCheckpoint, true); - } - - /** Emits all those elements in a single checkpoint. */ - public BoundedTestSource(T... elements) { - this(Collections.singletonList(Arrays.asList(elements))); - } - - @Override - public void run(SourceContext ctx) throws Exception { - if (!checkpointEnabled) { - Preconditions.checkArgument( - elementsPerCheckpoint.size() <= 1, - "There should be at most one list in the elementsPerCheckpoint when checkpoint is disabled."); - elementsPerCheckpoint.stream().flatMap(List::stream).forEach(ctx::collect); - return; - } - - for (List elements : elementsPerCheckpoint) { - - final int checkpointToAwait; - synchronized (ctx.getCheckpointLock()) { - // Let's say checkpointToAwait = numCheckpointsComplete.get() + delta, in fact the value of - // delta should not - // affect the final table records because we only need to make sure that there will be - // exactly - // elementsPerCheckpoint.size() checkpoints to emit each records buffer from the original - // elementsPerCheckpoint. - // Even if the checkpoints that emitted results are not continuous, the correctness of the - // data should not be - // affected in the end. Setting the delta to be 2 is introducing the variable that produce - // un-continuous - // checkpoints that emit the records buffer from elementsPerCheckpoints. - checkpointToAwait = numCheckpointsComplete.get() + 2; - for (T element : elements) { - ctx.collect(element); - } - } - - synchronized (ctx.getCheckpointLock()) { - while (running && numCheckpointsComplete.get() < checkpointToAwait) { - ctx.getCheckpointLock().wait(1); - } - } - } - } - - @Override - public void notifyCheckpointComplete(long checkpointId) throws Exception { - numCheckpointsComplete.incrementAndGet(); - } - - @Override - public void cancel() { - running = false; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java deleted file mode 100644 index 5dfbbe3abe73..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.TestBase; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestInfo; - -public class ChangeLogTableTestBase extends TestBase { - private volatile TableEnvironment tEnv = null; - - protected String tableName; - - @BeforeEach - public void setup(TestInfo testInfo) { - assertThat(testInfo.getTestMethod()).isPresent(); - this.tableName = testInfo.getTestMethod().get().getName(); - } - - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s", tableName); - BoundedTableFactory.clearDataSets(); - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings settings = - EnvironmentSettings.newInstance().inStreamingMode().build(); - - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(400) - .setMaxParallelism(1) - .setParallelism(1); - - tEnv = StreamTableEnvironment.create(env, settings); - } - } - } - return tEnv; - } - - protected static Row insertRow(Object... values) { - return Row.ofKind(RowKind.INSERT, values); - } - - protected static Row deleteRow(Object... values) { - return Row.ofKind(RowKind.DELETE, values); - } - - protected static Row updateBeforeRow(Object... values) { - return Row.ofKind(RowKind.UPDATE_BEFORE, values); - } - - protected static Row updateAfterRow(Object... values) { - return Row.ofKind(RowKind.UPDATE_AFTER, values); - } - - protected static List listJoin(List> lists) { - return lists.stream().flatMap(List::stream).collect(Collectors.toList()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java deleted file mode 100644 index 540902f3cea5..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.spy; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.BaseCombinedScanTask; -import org.apache.iceberg.BaseFileScanTask; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileMetadata; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.PartitionSpecParser; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.ResidualEvaluator; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.ThreadPools; - -public class SplitHelpers { - - private SplitHelpers() {} - - /** - * This create a list of IcebergSourceSplit from real files - *
  • Create a new Hadoop table under the {@code temporaryFolder} - *
  • write {@code fileCount} number of files to the new Iceberg table - *
  • Discover the splits from the table and partition the splits by the {@code filePerSplit} - * limit - *
  • Delete the Hadoop table - * - *
    Since the table and data files are deleted before this method returns, the caller shouldn't - * attempt to read the data files. - * - *
    By default, a v1 Iceberg table is created. For a v2 table, use {@link - * SplitHelpers#createSplitsFromTransientHadoopTable(Path, int, int, String)} - * - * @param temporaryFolder Folder to place the data to - * @param fileCount The number of files to create and add to the table - * @param filesPerSplit The number of files used for a split - */ - public static List createSplitsFromTransientHadoopTable( - Path temporaryFolder, int fileCount, int filesPerSplit) throws Exception { - return createSplitsFromTransientHadoopTable(temporaryFolder, fileCount, filesPerSplit, "1"); - } - - /** - * This creates a list of IcebergSourceSplit from real files - *
  • Create a new Hadoop table under the {@code temporaryFolder} - *
  • write {@code fileCount} number of files to the new Iceberg table - *
  • Discover the splits from the table and partition the splits by the {@code filePerSplit} - * limit - *
  • Delete the Hadoop table - * - *
    Since the table and data files are deleted before this method return, caller shouldn't - * attempt to read the data files. - * - * @param temporaryFolder Folder to place the data to - * @param fileCount The number of files to create and add to the table - * @param filesPerSplit The number of files used for a split - * @param version The table version to create - */ - public static List createSplitsFromTransientHadoopTable( - Path temporaryFolder, int fileCount, int filesPerSplit, String version) throws Exception { - final File warehouseFile = File.createTempFile("junit", null, temporaryFolder.toFile()); - assertThat(warehouseFile.delete()).isTrue(); - final String warehouse = "file:" + warehouseFile; - Configuration hadoopConf = new Configuration(); - final HadoopCatalog catalog = new HadoopCatalog(hadoopConf, warehouse); - ImmutableMap properties = - ImmutableMap.of(TableProperties.FORMAT_VERSION, version); - try { - final Table table = - catalog.createTable( - TestFixtures.TABLE_IDENTIFIER, - TestFixtures.SCHEMA, - PartitionSpec.unpartitioned(), - null, - properties); - final GenericAppenderHelper dataAppender = - new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); - for (int i = 0; i < fileCount; ++i) { - List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); - dataAppender.appendToTable(records); - } - - final ScanContext scanContext = ScanContext.builder().build(); - final List splits = - FlinkSplitPlanner.planIcebergSourceSplits( - table, scanContext, ThreadPools.getWorkerPool()); - return splits.stream() - .flatMap( - split -> { - List> filesList = - Lists.partition(Lists.newArrayList(split.task().files()), filesPerSplit); - return filesList.stream() - .map(files -> new BaseCombinedScanTask(files)) - .map( - combinedScanTask -> - IcebergSourceSplit.fromCombinedScanTask(combinedScanTask)); - }) - .collect(Collectors.toList()); - } finally { - catalog.dropTable(TestFixtures.TABLE_IDENTIFIER); - catalog.close(); - } - } - - /** - * This method will equip the {@code icebergSourceSplits} with mock delete files. - *
  • For each split, create {@code deleteFilesPerSplit} number of delete files - *
  • Replace the original {@code FileScanTask} with the new {@code FileScanTask} with mock - *
  • Caller should not attempt to read the deleted files since they are created as mock, and - * they are not real files - * - * @param icebergSourceSplits The real splits to equip with mock delete files - * @param temporaryFolder The temporary folder to create the mock delete files with - * @param deleteFilesPerSplit The number of delete files to create for each split - * @return The list of re-created splits with mock delete files - * @throws IOException If there is any error creating the mock delete files - */ - public static List equipSplitsWithMockDeleteFiles( - List icebergSourceSplits, Path temporaryFolder, int deleteFilesPerSplit) - throws IOException { - List icebergSourceSplitsWithMockDeleteFiles = Lists.newArrayList(); - for (IcebergSourceSplit split : icebergSourceSplits) { - final CombinedScanTask combinedScanTask = spy(split.task()); - - final List deleteFiles = Lists.newArrayList(); - final PartitionSpec spec = - PartitionSpec.builderFor(TestFixtures.SCHEMA).withSpecId(0).build(); - - for (int i = 0; i < deleteFilesPerSplit; ++i) { - final DeleteFile deleteFile = - FileMetadata.deleteFileBuilder(spec) - .withFormat(FileFormat.PARQUET) - .withPath(File.createTempFile("junit", null, temporaryFolder.toFile()).getPath()) - .ofPositionDeletes() - .withFileSizeInBytes(1000) - .withRecordCount(1000) - .build(); - deleteFiles.add(deleteFile); - } - - List newFileScanTasks = Lists.newArrayList(); - for (FileScanTask task : combinedScanTask.tasks()) { - String schemaString = SchemaParser.toJson(task.schema()); - String specString = PartitionSpecParser.toJson(task.spec()); - - BaseFileScanTask baseFileScanTask = - new BaseFileScanTask( - task.file(), - deleteFiles.toArray(new DeleteFile[] {}), - schemaString, - specString, - ResidualEvaluator.unpartitioned(task.residual())); - newFileScanTasks.add(baseFileScanTask); - } - doReturn(newFileScanTasks).when(combinedScanTask).tasks(); - icebergSourceSplitsWithMockDeleteFiles.add( - IcebergSourceSplit.fromCombinedScanTask( - combinedScanTask, split.fileOffset(), split.recordOffset())); - } - return icebergSourceSplitsWithMockDeleteFiles; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java deleted file mode 100644 index e4e48ca67f66..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.List; -import java.util.Map; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -public class SqlHelpers { - private SqlHelpers() {} - - public static List sql(TableEnvironment tableEnv, String query, Object... args) { - TableResult tableResult = tableEnv.executeSql(String.format(query, args)); - try (CloseableIterator iter = tableResult.collect()) { - List results = Lists.newArrayList(iter); - return results; - } catch (Exception e) { - throw new RuntimeException("Failed to collect table result", e); - } - } - - public static String sqlOptionsToString(Map sqlOptions) { - StringBuilder builder = new StringBuilder(); - sqlOptions.forEach((key, value) -> builder.append(optionToKv(key, value)).append(",")); - String optionStr = builder.toString(); - if (optionStr.endsWith(",")) { - optionStr = optionStr.substring(0, optionStr.length() - 1); - } - - if (!optionStr.isEmpty()) { - optionStr = String.format("/*+ OPTIONS(%s)*/", optionStr); - } - - return optionStr; - } - - private static String optionToKv(String key, Object value) { - return "'" + key + "'='" + value + "'"; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java deleted file mode 100644 index f89d63ac73e3..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.events.Listeners; -import org.apache.iceberg.events.ScanEvent; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.TestBase; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public abstract class TableSourceTestBase extends TestBase { - @Parameters(name = "useFlip27Source = {0}") - protected static Object[][] parameters() { - return new Object[][] { - {false}, {true}, - }; - } - - @Parameter(index = 0) - protected boolean useFlip27Source; - - protected static final String CATALOG_NAME = "test_catalog"; - protected static final String DATABASE_NAME = "test_db"; - protected static final String TABLE_NAME = "test_table"; - protected final FileFormat format = FileFormat.AVRO; - protected int scanEventCount = 0; - protected ScanEvent lastScanEvent = null; - - @Override - protected TableEnvironment getTableEnv() { - super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); - super.getTableEnv() - .getConfig() - .getConfiguration() - .setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE.key(), useFlip27Source); - return super.getTableEnv(); - } - - @BeforeEach - public void before() throws IOException { - // register a scan event listener to validate pushdown - Listeners.register( - event -> { - scanEventCount += 1; - lastScanEvent = event; - }, - ScanEvent.class); - - File warehouseFile = File.createTempFile("junit", null, temporaryDirectory.toFile()); - assertThat(warehouseFile.delete()).isTrue(); - String warehouse = String.format("file:%s", warehouseFile); - - sql( - "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_NAME, warehouse); - sql("USE CATALOG %s", CATALOG_NAME); - sql("CREATE DATABASE %s", DATABASE_NAME); - sql("USE %s", DATABASE_NAME); - sql( - "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", - TABLE_NAME, format.name()); - sql( - "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", - TABLE_NAME); - - this.scanEventCount = 0; - this.lastScanEvent = null; - } - - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, TABLE_NAME); - dropDatabase(DATABASE_NAME, true); - dropCatalog(CATALOG_NAME, true); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java deleted file mode 100644 index bde751e1f87f..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.Objects; -import java.util.stream.Collectors; -import org.apache.flink.types.Row; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Streams; -import org.junit.jupiter.api.Test; - -public class TestBoundedTableFactory extends ChangeLogTableTestBase { - - @Test - public void testEmptyDataSet() { - List> emptyDataSet = ImmutableList.of(); - - String dataId = BoundedTableFactory.registerDataSet(emptyDataSet); - sql( - "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", - tableName, dataId); - - assertThat(sql("SELECT * FROM %s", tableName)).isEmpty(); - } - - @Test - public void testBoundedTableFactory() { - List> dataSet = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb")), - ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd")), - ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd"))); - - String dataId = BoundedTableFactory.registerDataSet(dataSet); - sql( - "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", - tableName, dataId); - - List rowSet = dataSet.stream().flatMap(Streams::stream).collect(Collectors.toList()); - assertThat(sql("SELECT * FROM %s", tableName)).isEqualTo(rowSet); - - assertThat(sql("SELECT * FROM %s WHERE data='aaa'", tableName)) - .isEqualTo( - rowSet.stream() - .filter(r -> Objects.equals(r.getField(1), "aaa")) - .collect(Collectors.toList())); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java deleted file mode 100644 index c8b65e131c33..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.flink.SimpleDataUtil.SCHEMA; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.TestTemplate; - -/** Test {@link FlinkInputFormat}. */ -public class TestFlinkInputFormat extends TestFlinkSource { - - @Override - protected List run( - FlinkSource.Builder formatBuilder, - Map sqlOptions, - String sqlFilter, - String... sqlSelectedFields) - throws Exception { - return runFormat(formatBuilder.tableLoader(tableLoader()).buildFormat()); - } - - @TestTemplate - public void testNestedProjection() throws Exception { - Schema schema = - new Schema( - required(1, "data", Types.StringType.get()), - required( - 2, - "nested", - Types.StructType.of( - Types.NestedField.required(3, "f1", Types.StringType.get()), - Types.NestedField.required(4, "f2", Types.StringType.get()), - Types.NestedField.required(5, "f3", Types.LongType.get()))), - required(6, "id", Types.LongType.get())); - - Table table = - CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("default", "t"), schema); - - List writeRecords = RandomGenericData.generate(schema, 2, 0L); - new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(writeRecords); - - // Schema: [data, nested[f1, f2, f3], id] - // Projection: [nested.f2, data] - // The Flink SQL output: [f2, data] - // The FlinkInputFormat output: [nested[f2], data] - - TableSchema projectedSchema = - TableSchema.builder() - .field("nested", DataTypes.ROW(DataTypes.FIELD("f2", DataTypes.STRING()))) - .field("data", DataTypes.STRING()) - .build(); - List result = - runFormat( - FlinkSource.forRowData() - .tableLoader(tableLoader()) - .project(projectedSchema) - .buildFormat()); - - List expected = Lists.newArrayList(); - for (Record record : writeRecords) { - Row nested = Row.of(((Record) record.get(1)).get(1)); - expected.add(Row.of(nested, record.get(0))); - } - - TestHelpers.assertRows(result, expected); - } - - @TestTemplate - public void testBasicProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(2, "time", Types.TimestampType.withZone())); - - Table table = - 
CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("default", "t"), writeSchema); - - List writeRecords = RandomGenericData.generate(writeSchema, 2, 0L); - new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(writeRecords); - - TableSchema projectedSchema = - TableSchema.builder() - .field("id", DataTypes.BIGINT()) - .field("data", DataTypes.STRING()) - .build(); - List result = - runFormat( - FlinkSource.forRowData() - .tableLoader(tableLoader()) - .project(projectedSchema) - .buildFormat()); - - List expected = Lists.newArrayList(); - for (Record record : writeRecords) { - expected.add(Row.of(record.get(0), record.get(1))); - } - - TestHelpers.assertRows(result, expected); - } - - @TestTemplate - public void testReadPartitionColumn() throws Exception { - assumeThat(fileFormat).as("Temporary skip ORC").isNotEqualTo(FileFormat.ORC); - - Schema nestedSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional( - 2, - "struct", - Types.StructType.of( - Types.NestedField.optional(3, "innerId", Types.LongType.get()), - Types.NestedField.optional(4, "innerName", Types.StringType.get())))); - PartitionSpec spec = - PartitionSpec.builderFor(nestedSchema).identity("struct.innerName").build(); - - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, nestedSchema, spec); - List records = RandomGenericData.generate(nestedSchema, 10, 0L); - GenericAppenderHelper appender = - new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - for (Record record : records) { - org.apache.iceberg.TestHelpers.Row partition = - org.apache.iceberg.TestHelpers.Row.of(record.get(1, Record.class).get(1)); - appender.appendToTable(partition, Collections.singletonList(record)); - } - - TableSchema projectedSchema = - TableSchema.builder() - .field("struct", DataTypes.ROW(DataTypes.FIELD("innerName", DataTypes.STRING()))) - .build(); - List result = - runFormat( - FlinkSource.forRowData() - .tableLoader(tableLoader()) - .project(projectedSchema) - .buildFormat()); - - List expected = Lists.newArrayList(); - for (Record record : records) { - Row nested = Row.of(((Record) record.get(1)).get(1)); - expected.add(Row.of(nested)); - } - - TestHelpers.assertRows(result, expected); - } - - @TestTemplate - public void testValidation() { - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); - - assertThatThrownBy( - () -> - FlinkSource.forRowData() - .env(StreamExecutionEnvironment.getExecutionEnvironment()) - .tableLoader(tableLoader()) - .streaming(false) - .endTag("tag") - .endSnapshotId(1L) - .build()) - .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set.") - .isInstanceOf(IllegalArgumentException.class); - } - - private List runFormat(FlinkInputFormat inputFormat) throws IOException { - RowType rowType = FlinkSchemaUtil.convert(inputFormat.projectedSchema()); - return TestHelpers.readRows(inputFormat, rowType); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java deleted file mode 100644 index 1b4fc863631f..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.Map; -import org.apache.flink.table.types.logical.RowType; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkInputFormatReaderDeletes extends TestFlinkReaderDeletesBase { - - @Override - protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) - throws IOException { - Schema projected = testTable.schema().select(columns); - RowType rowType = FlinkSchemaUtil.convert(projected); - Map properties = Maps.newHashMap(); - properties.put( - CatalogProperties.WAREHOUSE_LOCATION, - hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); - properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); - properties.put( - CatalogProperties.CLIENT_POOL_SIZE, - Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); - CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); - FlinkInputFormat inputFormat = - FlinkSource.forRowData() - .tableLoader( - TableLoader.fromCatalog( - hiveCatalogLoader, TableIdentifier.of("default", tableName))) - .project(FlinkSchemaUtil.toSchema(rowType)) - .buildFormat(); - - StructLikeSet set = StructLikeSet.create(projected.asStruct()); - TestHelpers.readRowData(inputFormat, rowType) - .forEach( - rowData -> { - RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); - set.add(wrapper.wrap(rowData)); - }); - - return set; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java deleted file mode 100644 index 59a4c3118cdf..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Files; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestMergingMetrics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.RowDataConverter; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.sink.FlinkAppenderFactory; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.extension.RegisterExtension; - -public class TestFlinkMergingMetrics extends TestMergingMetrics { - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension("test_db", "test_table"); - - @Override - protected FileAppender writeAndGetAppender(List records) throws IOException { - Table table = CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); - RowType flinkSchema = FlinkSchemaUtil.convert(SCHEMA); - FileAppender appender = - new FlinkAppenderFactory( - table, - SCHEMA, - flinkSchema, - ImmutableMap.of(), - PartitionSpec.unpartitioned(), - null, - null, - null) - .newAppender( - Files.localOutput(File.createTempFile("junit", null, tempDir)), fileFormat); - try (FileAppender fileAppender = appender) { - records.stream().map(r -> RowDataConverter.convert(SCHEMA, r)).forEach(fileAppender::add); - } - return appender; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java deleted file mode 100644 index 8352924d042a..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java +++ /dev/null @@ -1,813 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.time.Instant; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.avro.generic.GenericData; -import org.apache.commons.collections.ListUtils; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.types.Row; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileContent; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Files; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.MetadataTableUtils; -import org.apache.iceberg.MetricsUtil; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.FileHelpers; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.CatalogTestBase; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SnapshotUtil; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.io.TempDir; - -public class TestFlinkMetaDataTable extends CatalogTestBase { - private static final String TABLE_NAME = "test_table"; - private final FileFormat format = FileFormat.AVRO; - private @TempDir Path temp; - - @Parameter(index = 2) - private Boolean isPartition; - - @Parameters(name = "catalogName={0}, baseNamespace={1}, isPartition={2}") - protected static List parameters() { - List parameters = Lists.newArrayList(); - - for (Boolean isPartition : new Boolean[] {true, false}) { - String catalogName = "testhadoop"; - Namespace baseNamespace = Namespace.of("default"); - parameters.add(new Object[] {catalogName, baseNamespace, isPartition}); - } - return parameters; - } - - @Override - protected TableEnvironment getTableEnv() { - Configuration configuration = super.getTableEnv().getConfig().getConfiguration(); - configuration.set(CoreOptions.DEFAULT_PARALLELISM, 1); - return super.getTableEnv(); - } - - @BeforeEach - public void before() { - super.before(); - sql("USE CATALOG %s", catalogName); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE %s", DATABASE); - if (isPartition) { - sql( - "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) PARTITIONED BY (data) WITH ('format-version'='2', 'write.format.default'='%s')", - TABLE_NAME, format.name()); - sql("INSERT INTO %s VALUES 
(1,'a',10),(2,'a',20)", TABLE_NAME); - sql("INSERT INTO %s VALUES (1,'b',10),(2,'b',20)", TABLE_NAME); - } else { - sql( - "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('format-version'='2', 'write.format.default'='%s')", - TABLE_NAME, format.name()); - sql( - "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", - TABLE_NAME); - sql("INSERT INTO %s VALUES (4,'iceberg',10)", TABLE_NAME); - } - } - - @Override - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); - dropDatabase(flinkDatabase, true); - super.clean(); - } - - @TestTemplate - public void testSnapshots() { - String sql = String.format("SELECT * FROM %s$snapshots ", TABLE_NAME); - List result = sql(sql); - - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - Iterator snapshots = table.snapshots().iterator(); - for (Row row : result) { - Snapshot next = snapshots.next(); - assertThat(((Instant) row.getField(0)).toEpochMilli()) - .as("Should have expected timestamp") - .isEqualTo(next.timestampMillis()); - assertThat(next.snapshotId()) - .as("Should have expected snapshot id") - .isEqualTo(next.snapshotId()); - assertThat(row.getField(2)).as("Should have expected parent id").isEqualTo(next.parentId()); - assertThat(row.getField(3)).as("Should have expected operation").isEqualTo(next.operation()); - assertThat(row.getField(4)) - .as("Should have expected manifest list location") - .isEqualTo(next.manifestListLocation()); - assertThat(row.getField(5)).as("Should have expected summary").isEqualTo(next.summary()); - } - } - - @TestTemplate - public void testHistory() { - String sql = String.format("SELECT * FROM %s$history ", TABLE_NAME); - List result = sql(sql); - - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - Iterator snapshots = table.snapshots().iterator(); - for (Row row : result) { - Snapshot next = snapshots.next(); - assertThat(((Instant) row.getField(0)).toEpochMilli()) - .as("Should have expected made_current_at") - .isEqualTo(next.timestampMillis()); - assertThat(row.getField(1)) - .as("Should have expected snapshot id") - .isEqualTo(next.snapshotId()); - assertThat(row.getField(2)).as("Should have expected parent id").isEqualTo(next.parentId()); - assertThat(row.getField(3)) - .as("Should have expected is current ancestor") - .isEqualTo( - SnapshotUtil.isAncestorOf( - table, table.currentSnapshot().snapshotId(), next.snapshotId())); - } - } - - @TestTemplate - public void testManifests() { - String sql = String.format("SELECT * FROM %s$manifests ", TABLE_NAME); - List result = sql(sql); - - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - List expectedDataManifests = dataManifests(table); - - for (int i = 0; i < result.size(); i++) { - Row row = result.get(i); - ManifestFile manifestFile = expectedDataManifests.get(i); - assertThat(row.getField(0)) - .as("Should have expected content") - .isEqualTo(manifestFile.content().id()); - assertThat(row.getField(1)).as("Should have expected path").isEqualTo(manifestFile.path()); - assertThat(row.getField(2)) - .as("Should have expected length") - .isEqualTo(manifestFile.length()); - assertThat(row.getField(3)) - .as("Should have expected partition_spec_id") - .isEqualTo(manifestFile.partitionSpecId()); - assertThat(row.getField(4)) - .as("Should have expected added_snapshot_id") - .isEqualTo(manifestFile.snapshotId()); - assertThat(row.getField(5)) - 
.as("Should have expected added_data_files_count") - .isEqualTo(manifestFile.addedFilesCount()); - assertThat(row.getField(6)) - .as("Should have expected existing_data_files_count") - .isEqualTo(manifestFile.existingFilesCount()); - assertThat(row.getField(7)) - .as("Should have expected deleted_data_files_count") - .isEqualTo(manifestFile.deletedFilesCount()); - } - } - - @TestTemplate - public void testAllManifests() { - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - String sql = String.format("SELECT * FROM %s$all_manifests ", TABLE_NAME); - List result = sql(sql); - - List expectedDataManifests = allDataManifests(table); - - assertThat(expectedDataManifests).hasSize(result.size()); - for (int i = 0; i < result.size(); i++) { - Row row = result.get(i); - ManifestFile manifestFile = expectedDataManifests.get(i); - assertThat(row.getField(0)) - .as("Should have expected content") - .isEqualTo(manifestFile.content().id()); - assertThat(row.getField(1)).as("Should have expected path").isEqualTo(manifestFile.path()); - assertThat(row.getField(2)) - .as("Should have expected length") - .isEqualTo(manifestFile.length()); - assertThat(row.getField(3)) - .as("Should have expected partition_spec_id") - .isEqualTo(manifestFile.partitionSpecId()); - assertThat(row.getField(4)) - .as("Should have expected added_snapshot_id") - .isEqualTo(manifestFile.snapshotId()); - assertThat(row.getField(5)) - .as("Should have expected added_data_files_count") - .isEqualTo(manifestFile.addedFilesCount()); - assertThat(row.getField(6)) - .as("Should have expected existing_data_files_count") - .isEqualTo(manifestFile.existingFilesCount()); - assertThat(row.getField(7)) - .as("Should have expected deleted_data_files_count") - .isEqualTo(manifestFile.deletedFilesCount()); - } - } - - @TestTemplate - public void testUnPartitionedTable() throws IOException { - assumeThat(isPartition).isFalse(); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - Schema deleteRowSchema = table.schema().select("id"); - Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList(dataDelete.copy("id", 1)); - File testFile = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes = - FileHelpers.writeDeleteFile( - table, Files.localOutput(testFile), dataDeletes, deleteRowSchema); - table.newRowDelta().addDeletes(eqDeletes).commit(); - - List expectedDataManifests = dataManifests(table); - List expectedDeleteManifests = deleteManifests(table); - - assertThat(expectedDataManifests).hasSize(2); - assertThat(expectedDeleteManifests).hasSize(1); - - Schema entriesTableSchema = - MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) - .schema(); - - // check delete files table - Schema deleteFilesTableSchema = - MetadataTableUtils.createMetadataTableInstance( - table, MetadataTableType.from("delete_files")) - .schema(); - - List deleteColumns = - deleteFilesTableSchema.columns().stream() - .map(Types.NestedField::name) - .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) - .collect(Collectors.toList()); - String deleteNames = - deleteColumns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); - - deleteFilesTableSchema = deleteFilesTableSchema.select(deleteColumns); - - List actualDeleteFiles = sql("SELECT %s FROM %s$delete_files", deleteNames, TABLE_NAME); - assertThat(actualDeleteFiles).hasSize(1); - 
assertThat(expectedDeleteManifests).as("Should have 1 delete manifest").hasSize(1); - - List expectedDeleteFiles = - expectedEntries( - table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, null); - assertThat(expectedDeleteFiles).as("Should be 1 delete file manifest entry").hasSize(1); - TestHelpers.assertEquals( - deleteFilesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); - - // Check data files table - Schema filesTableSchema = - MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("files")) - .schema(); - - List columns = - filesTableSchema.columns().stream() - .map(Types.NestedField::name) - .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) - .collect(Collectors.toList()); - String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); - - filesTableSchema = filesTableSchema.select(columns); - - List actualDataFiles = sql("SELECT %s FROM %s$data_files", names, TABLE_NAME); - assertThat(actualDataFiles).as("Metadata table should return 2 data file").hasSize(2); - List expectedDataFiles = - expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); - assertThat(expectedDataFiles).as("Should be 2 data file manifest entry").hasSize(2); - TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); - - // check all files table - List actualFiles = sql("SELECT %s FROM %s$files ORDER BY content", names, TABLE_NAME); - assertThat(actualFiles).as("Metadata table should return 3 files").hasSize(3); - List expectedFiles = - Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) - .collect(Collectors.toList()); - assertThat(expectedFiles).as("Should have 3 files manifest entriess").hasSize(3); - TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(0), actualFiles.get(0)); - TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(1), actualFiles.get(1)); - } - - @TestTemplate - public void testPartitionedTable() throws Exception { - assumeThat(isPartition).isTrue(); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - Schema deleteRowSchema = table.schema().select("id", "data"); - Record dataDelete = GenericRecord.create(deleteRowSchema); - - Map deleteRow = Maps.newHashMap(); - deleteRow.put("id", 1); - deleteRow.put("data", "a"); - File testFile = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(testFile), - org.apache.iceberg.TestHelpers.Row.of("a"), - Lists.newArrayList(dataDelete.copy(deleteRow)), - deleteRowSchema); - table.newRowDelta().addDeletes(eqDeletes).commit(); - - deleteRow.put("data", "b"); - File testFile2 = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes2 = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(testFile2), - org.apache.iceberg.TestHelpers.Row.of("b"), - Lists.newArrayList(dataDelete.copy(deleteRow)), - deleteRowSchema); - table.newRowDelta().addDeletes(eqDeletes2).commit(); - - Schema entriesTableSchema = - MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) - .schema(); - - List expectedDataManifests = dataManifests(table); - List expectedDeleteManifests = deleteManifests(table); - - assertThat(expectedDataManifests).hasSize(2); - assertThat(expectedDeleteManifests).hasSize(2); - Table deleteFilesTable = - MetadataTableUtils.createMetadataTableInstance( - table, 
MetadataTableType.from("delete_files")); - Schema filesTableSchema = deleteFilesTable.schema(); - - List columns = - filesTableSchema.columns().stream() - .map(Types.NestedField::name) - .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) - .collect(Collectors.toList()); - String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); - - filesTableSchema = filesTableSchema.select(columns); - - // Check delete files table - List expectedDeleteFiles = - expectedEntries( - table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); - assertThat(expectedDeleteFiles).hasSize(1); - List actualDeleteFiles = - sql("SELECT %s FROM %s$delete_files WHERE `partition`.`data`='a'", names, TABLE_NAME); - - assertThat(actualDeleteFiles).hasSize(1); - TestHelpers.assertEquals( - filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); - - // Check data files table - List expectedDataFiles = - expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); - assertThat(expectedDataFiles).hasSize(1); - List actualDataFiles = - sql("SELECT %s FROM %s$data_files WHERE `partition`.`data`='a'", names, TABLE_NAME); - assertThat(actualDataFiles).hasSize(1); - TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); - - List actualPartitionsWithProjection = - sql("SELECT file_count FROM %s$partitions ", TABLE_NAME); - assertThat(actualPartitionsWithProjection).hasSize(2); - for (int i = 0; i < 2; ++i) { - assertThat(actualPartitionsWithProjection.get(i).getField(0)).isEqualTo(1); - } - - // Check files table - List expectedFiles = - Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) - .collect(Collectors.toList()); - assertThat(expectedFiles).hasSize(2); - List actualFiles = - sql( - "SELECT %s FROM %s$files WHERE `partition`.`data`='a' ORDER BY content", - names, TABLE_NAME); - assertThat(actualFiles).hasSize(2); - TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(0), actualFiles.get(0)); - TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(1), actualFiles.get(1)); - } - - @TestTemplate - public void testAllFilesUnpartitioned() throws Exception { - assumeThat(isPartition).isFalse(); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - Schema deleteRowSchema = table.schema().select("id", "data"); - Record dataDelete = GenericRecord.create(deleteRowSchema); - - Map deleteRow = Maps.newHashMap(); - deleteRow.put("id", 1); - File testFile = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(testFile), - Lists.newArrayList(dataDelete.copy(deleteRow)), - deleteRowSchema); - table.newRowDelta().addDeletes(eqDeletes).commit(); - - List expectedDataManifests = dataManifests(table); - assertThat(expectedDataManifests).hasSize(2); - List expectedDeleteManifests = deleteManifests(table); - assertThat(expectedDeleteManifests).hasSize(1); - - // Clear table to test whether 'all_files' can read past files - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); - - Schema entriesTableSchema = - MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) - .schema(); - Schema filesTableSchema = - MetadataTableUtils.createMetadataTableInstance( - table, MetadataTableType.from("all_data_files")) - .schema(); - - List columns = - filesTableSchema.columns().stream() - 
.map(Types.NestedField::name) - .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) - .collect(Collectors.toList()); - String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); - - filesTableSchema = filesTableSchema.select(columns); - - // Check all data files table - List actualDataFiles = - sql("SELECT %s FROM %s$all_data_files order by record_count ", names, TABLE_NAME); - - List expectedDataFiles = - expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); - assertThat(expectedDataFiles).hasSize(2); - assertThat(actualDataFiles).hasSize(2); - TestHelpers.assertEquals(filesTableSchema, expectedDataFiles, actualDataFiles); - - // Check all delete files table - List actualDeleteFiles = sql("SELECT %s FROM %s$all_delete_files", names, TABLE_NAME); - List expectedDeleteFiles = - expectedEntries( - table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, null); - assertThat(expectedDeleteFiles).hasSize(1); - assertThat(actualDeleteFiles).hasSize(1); - TestHelpers.assertEquals( - filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); - - // Check all files table - List actualFiles = - sql("SELECT %s FROM %s$all_files ORDER BY content, record_count asc", names, TABLE_NAME); - List expectedFiles = - ListUtils.union(expectedDataFiles, expectedDeleteFiles); - expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); - assertThat(actualFiles).hasSize(3); - TestHelpers.assertEquals(filesTableSchema, expectedFiles, actualFiles); - } - - @TestTemplate - public void testAllFilesPartitioned() throws Exception { - assumeThat(!isPartition).isFalse(); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - // Create delete file - Schema deleteRowSchema = table.schema().select("id"); - Record dataDelete = GenericRecord.create(deleteRowSchema); - - Map deleteRow = Maps.newHashMap(); - deleteRow.put("id", 1); - File testFile = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(testFile), - org.apache.iceberg.TestHelpers.Row.of("a"), - Lists.newArrayList(dataDelete.copy(deleteRow)), - deleteRowSchema); - File testFile2 = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes2 = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(testFile2), - org.apache.iceberg.TestHelpers.Row.of("b"), - Lists.newArrayList(dataDelete.copy(deleteRow)), - deleteRowSchema); - table.newRowDelta().addDeletes(eqDeletes).addDeletes(eqDeletes2).commit(); - - List expectedDataManifests = dataManifests(table); - assertThat(expectedDataManifests).hasSize(2); - List expectedDeleteManifests = deleteManifests(table); - assertThat(expectedDeleteManifests).hasSize(1); - // Clear table to test whether 'all_files' can read past files - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); - - Schema entriesTableSchema = - MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) - .schema(); - Schema filesTableSchema = - MetadataTableUtils.createMetadataTableInstance( - table, MetadataTableType.from("all_data_files")) - .schema(); - - List columns = - filesTableSchema.columns().stream() - .map(Types.NestedField::name) - .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) - .collect(Collectors.toList()); - String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); - - 
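// The expected rows above are sorted on the "content" field to mirror ORDER BY content in
// the SQL. A small, self-contained sketch of why that ordering is stable, assuming
// iceberg-api on the classpath: FileContent ids are DATA=0, POSITION_DELETES=1,
// EQUALITY_DELETES=2, so data files always sort before equality deletes. The class name is
// illustrative only.
import java.util.Comparator;
import java.util.List;
import org.apache.iceberg.FileContent;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;

class ContentOrderingSketch {
  public static void main(String[] args) {
    List<FileContent> contents =
        Lists.newArrayList(FileContent.EQUALITY_DELETES, FileContent.DATA);
    // Sorting by id() reproduces the ORDER BY content ordering used in the queries above.
    contents.sort(Comparator.comparingInt(FileContent::id));
    System.out.println(contents); // expected to print [DATA, EQUALITY_DELETES]
  }
}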
filesTableSchema = filesTableSchema.select(columns); - - // Check all data files table - List actualDataFiles = - sql("SELECT %s FROM %s$all_data_files WHERE `partition`.`data`='a'", names, TABLE_NAME); - List expectedDataFiles = - expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); - assertThat(expectedDataFiles).hasSize(1); - assertThat(actualDataFiles).hasSize(1); - TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); - - // Check all delete files table - List actualDeleteFiles = - sql("SELECT %s FROM %s$all_delete_files WHERE `partition`.`data`='a'", names, TABLE_NAME); - List expectedDeleteFiles = - expectedEntries( - table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); - assertThat(expectedDeleteFiles).hasSize(1); - assertThat(actualDeleteFiles).hasSize(1); - TestHelpers.assertEquals( - filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); - - // Check all files table - List actualFiles = - sql( - "SELECT %s FROM %s$all_files WHERE `partition`.`data`='a' ORDER BY content", - names, TABLE_NAME); - List expectedFiles = - ListUtils.union(expectedDataFiles, expectedDeleteFiles); - expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); - assertThat(actualFiles).hasSize(2); - TestHelpers.assertEquals(filesTableSchema, expectedFiles, actualFiles); - } - - @TestTemplate - public void testMetadataLogEntries() { - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - Long currentSnapshotId = table.currentSnapshot().snapshotId(); - TableMetadata tableMetadata = ((HasTableOperations) table).operations().current(); - Snapshot currentSnapshot = tableMetadata.currentSnapshot(); - Snapshot parentSnapshot = table.snapshot(currentSnapshot.parentId()); - List metadataLogEntries = - Lists.newArrayList(tableMetadata.previousFiles()); - - // Check metadataLog table - List metadataLogs = sql("SELECT * FROM %s$metadata_log_entries", TABLE_NAME); - - assertThat(metadataLogs).hasSize(3); - Row metadataLog = metadataLogs.get(0); - assertThat(metadataLog.getField("timestamp")) - .isEqualTo(Instant.ofEpochMilli(metadataLogEntries.get(0).timestampMillis())); - assertThat(metadataLog.getField("file")).isEqualTo(metadataLogEntries.get(0).file()); - assertThat(metadataLog.getField("latest_snapshot_id")).isNull(); - assertThat(metadataLog.getField("latest_schema_id")).isNull(); - assertThat(metadataLog.getField("latest_sequence_number")).isNull(); - - metadataLog = metadataLogs.get(1); - assertThat(metadataLog.getField("timestamp")) - .isEqualTo(Instant.ofEpochMilli(metadataLogEntries.get(1).timestampMillis())); - assertThat(metadataLog.getField("file")).isEqualTo(metadataLogEntries.get(1).file()); - assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(parentSnapshot.snapshotId()); - assertThat(metadataLog.getField("latest_schema_id")).isEqualTo(parentSnapshot.schemaId()); - assertThat(metadataLog.getField("latest_sequence_number")) - .isEqualTo(parentSnapshot.sequenceNumber()); - assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(parentSnapshot.snapshotId()); - - metadataLog = metadataLogs.get(2); - assertThat(metadataLog.getField("timestamp")) - .isEqualTo(Instant.ofEpochMilli(currentSnapshot.timestampMillis())); - assertThat(metadataLog.getField("file")).isEqualTo(tableMetadata.metadataFileLocation()); - 
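// The assertions above derive the expected metadata_log_entries rows from the table
// metadata itself. A minimal sketch of that derivation, assuming a Table instance loaded
// elsewhere; the class and method names are illustrative and not part of the test.
import org.apache.iceberg.HasTableOperations;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableMetadata;

class MetadataLogSketch {
  static void printMetadataLog(Table table) {
    TableMetadata current = ((HasTableOperations) table).operations().current();
    // Each previous metadata file plus the current metadata file location corresponds to
    // one row of the $metadata_log_entries table, in the same order checked above.
    for (TableMetadata.MetadataLogEntry entry : current.previousFiles()) {
      System.out.println(entry.timestampMillis() + " -> " + entry.file());
    }
    System.out.println("current -> " + current.metadataFileLocation());
  }
}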
assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(currentSnapshot.snapshotId()); - assertThat(metadataLog.getField("latest_schema_id")).isEqualTo(currentSnapshot.schemaId()); - assertThat(metadataLog.getField("latest_sequence_number")) - .isEqualTo(currentSnapshot.sequenceNumber()); - - // test filtering - List metadataLogWithFilters = - sql( - "SELECT * FROM %s$metadata_log_entries WHERE latest_snapshot_id = %s", - TABLE_NAME, currentSnapshotId); - assertThat(metadataLogWithFilters).hasSize(1); - metadataLog = metadataLogWithFilters.get(0); - assertThat(Instant.ofEpochMilli(tableMetadata.currentSnapshot().timestampMillis())) - .isEqualTo(metadataLog.getField("timestamp")); - - assertThat(metadataLog.getField("file")).isEqualTo(tableMetadata.metadataFileLocation()); - assertThat(metadataLog.getField("latest_snapshot_id")) - .isEqualTo(tableMetadata.currentSnapshot().snapshotId()); - assertThat(metadataLog.getField("latest_schema_id")) - .isEqualTo(tableMetadata.currentSnapshot().schemaId()); - assertThat(metadataLog.getField("latest_sequence_number")) - .isEqualTo(tableMetadata.currentSnapshot().sequenceNumber()); - - // test projection - List metadataFiles = - metadataLogEntries.stream() - .map(TableMetadata.MetadataLogEntry::file) - .collect(Collectors.toList()); - metadataFiles.add(tableMetadata.metadataFileLocation()); - List metadataLogWithProjection = - sql("SELECT file FROM %s$metadata_log_entries", TABLE_NAME); - assertThat(metadataLogWithProjection).hasSize(3); - for (int i = 0; i < metadataFiles.size(); i++) { - assertThat(metadataLogWithProjection.get(i).getField("file")).isEqualTo(metadataFiles.get(i)); - } - } - - @TestTemplate - public void testSnapshotReferencesMetatable() { - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - Long currentSnapshotId = table.currentSnapshot().snapshotId(); - - // Create branch - table - .manageSnapshots() - .createBranch("testBranch", currentSnapshotId) - .setMaxRefAgeMs("testBranch", 10) - .setMinSnapshotsToKeep("testBranch", 20) - .setMaxSnapshotAgeMs("testBranch", 30) - .commit(); - // Create Tag - table - .manageSnapshots() - .createTag("testTag", currentSnapshotId) - .setMaxRefAgeMs("testTag", 50) - .commit(); - // Check refs table - List references = sql("SELECT * FROM %s$refs", TABLE_NAME); - List branches = sql("SELECT * FROM %s$refs WHERE type='BRANCH'", TABLE_NAME); - assertThat(references).hasSize(3); - assertThat(branches).hasSize(2); - List tags = sql("SELECT * FROM %s$refs WHERE type='TAG'", TABLE_NAME); - assertThat(tags).hasSize(1); - // Check branch entries in refs table - List mainBranch = - sql("SELECT * FROM %s$refs WHERE name='main' AND type='BRANCH'", TABLE_NAME); - assertThat((String) mainBranch.get(0).getFieldAs("name")).isEqualTo("main"); - assertThat((String) mainBranch.get(0).getFieldAs("type")).isEqualTo("BRANCH"); - assertThat((Long) mainBranch.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); - List testBranch = - sql("SELECT * FROM %s$refs WHERE name='testBranch' AND type='BRANCH'", TABLE_NAME); - assertThat((String) testBranch.get(0).getFieldAs("name")).isEqualTo("testBranch"); - assertThat((String) testBranch.get(0).getFieldAs("type")).isEqualTo("BRANCH"); - assertThat((Long) testBranch.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); - assertThat((Long) testBranch.get(0).getFieldAs("max_reference_age_in_ms")) - .isEqualTo(Long.valueOf(10)); - assertThat((Integer) testBranch.get(0).getFieldAs("min_snapshots_to_keep")) - 
.isEqualTo(Integer.valueOf(20)); - assertThat((Long) testBranch.get(0).getFieldAs("max_snapshot_age_in_ms")) - .isEqualTo(Long.valueOf(30)); - - // Check tag entries in refs table - List testTag = - sql("SELECT * FROM %s$refs WHERE name='testTag' AND type='TAG'", TABLE_NAME); - assertThat((String) testTag.get(0).getFieldAs("name")).isEqualTo("testTag"); - assertThat((String) testTag.get(0).getFieldAs("type")).isEqualTo("TAG"); - assertThat((Long) testTag.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); - assertThat((Long) testTag.get(0).getFieldAs("max_reference_age_in_ms")) - .isEqualTo(Long.valueOf(50)); - // Check projection in refs table - List testTagProjection = - sql( - "SELECT name,type,snapshot_id,max_reference_age_in_ms,min_snapshots_to_keep FROM %s$refs where type='TAG'", - TABLE_NAME); - assertThat((String) testTagProjection.get(0).getFieldAs("name")).isEqualTo("testTag"); - assertThat((String) testTagProjection.get(0).getFieldAs("type")).isEqualTo("TAG"); - assertThat((Long) testTagProjection.get(0).getFieldAs("snapshot_id")) - .isEqualTo(currentSnapshotId); - assertThat((Long) testTagProjection.get(0).getFieldAs("max_reference_age_in_ms")) - .isEqualTo(Long.valueOf(50)); - assertThat((String) testTagProjection.get(0).getFieldAs("min_snapshots_to_keep")).isNull(); - List mainBranchProjection = - sql("SELECT name, type FROM %s$refs WHERE name='main' AND type = 'BRANCH'", TABLE_NAME); - assertThat((String) mainBranchProjection.get(0).getFieldAs("name")).isEqualTo("main"); - assertThat((String) mainBranchProjection.get(0).getFieldAs("type")).isEqualTo("BRANCH"); - List testBranchProjection = - sql( - "SELECT type, name, max_reference_age_in_ms, snapshot_id FROM %s$refs WHERE name='testBranch' AND type = 'BRANCH'", - TABLE_NAME); - assertThat((String) testBranchProjection.get(0).getFieldAs("name")).isEqualTo("testBranch"); - assertThat((String) testBranchProjection.get(0).getFieldAs("type")).isEqualTo("BRANCH"); - assertThat((Long) testBranchProjection.get(0).getFieldAs("snapshot_id")) - .isEqualTo(currentSnapshotId); - assertThat((Long) testBranchProjection.get(0).getFieldAs("max_reference_age_in_ms")) - .isEqualTo(Long.valueOf(10)); - } - - /** - * Find matching manifest entries of an Iceberg table - * - * @param table iceberg table - * @param expectedContent file content to populate on entries - * @param entriesTableSchema schema of Manifest entries - * @param manifestsToExplore manifests to explore of the table - * @param partValue partition value that manifest entries must match, or null to skip filtering - */ - private List expectedEntries( - Table table, - FileContent expectedContent, - Schema entriesTableSchema, - List manifestsToExplore, - String partValue) - throws IOException { - List expected = Lists.newArrayList(); - for (ManifestFile manifest : manifestsToExplore) { - InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = - Avro.read(in).project(entriesTableSchema).build()) { - for (GenericData.Record record : rows) { - if ((Integer) record.get("status") < 2 /* added or existing */) { - GenericData.Record file = (GenericData.Record) record.get("data_file"); - if (partitionMatch(file, partValue)) { - asMetadataRecord(file, expectedContent); - expected.add(file); - } - } - } - } - } - return expected; - } - - // Populate certain fields derived in the metadata tables - private void asMetadataRecord(GenericData.Record file, FileContent content) { - file.put(0, content.id()); - file.put(3, 0); // specId - } - - private 
boolean partitionMatch(GenericData.Record file, String partValue) { - if (partValue == null) { - return true; - } - GenericData.Record partition = (GenericData.Record) file.get(4); - return partValue.equals(partition.get(0).toString()); - } - - private List dataManifests(Table table) { - return table.currentSnapshot().dataManifests(table.io()); - } - - private List allDataManifests(Table table) { - List manifests = Lists.newArrayList(); - for (Snapshot snapshot : table.snapshots()) { - manifests.addAll(snapshot.dataManifests(table.io())); - } - return manifests; - } - - private List deleteManifests(Table table) { - return table.currentSnapshot().deleteManifests(table.io()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java deleted file mode 100644 index 188a44d7cdba..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.Map; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.DeleteReadTests; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.hive.TestHiveMetastore; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public abstract class TestFlinkReaderDeletesBase extends DeleteReadTests { - - protected static String databaseName = "default"; - - protected static HiveConf hiveConf = null; - protected static HiveCatalog catalog = null; - private static TestHiveMetastore metastore = null; - - @BeforeAll - public static void startMetastore() { - metastore = new TestHiveMetastore(); - metastore.start(); - hiveConf = metastore.hiveConf(); - catalog = - (HiveCatalog) - CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); - } - - @AfterAll - public static void stopMetastore() throws Exception { - metastore.stop(); - catalog = null; - } - - @Override - protected Table createTable(String name, Schema schema, PartitionSpec spec) { - Map props = Maps.newHashMap(); - props.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - - Table table = catalog.createTable(TableIdentifier.of(databaseName, name), schema, spec, props); - TableOperations ops = ((BaseTable) table).operations(); - TableMetadata meta = ops.current(); - ops.commit(meta, meta.upgradeToFormatVersion(formatVersion)); - - return table; - } - - @Override - protected void dropTable(String name) { - catalog.dropTable(TableIdentifier.of(databaseName, name)); - } - - @Override - protected boolean expectPruned() { - return false; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java deleted file mode 100644 index cf6b233dcec6..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java +++ /dev/null @@ -1,540 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.nio.file.Path; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public abstract class TestFlinkScan { - @RegisterExtension - protected static MiniClusterExtension miniClusterExtension = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @TempDir protected Path temporaryDirectory; - - @RegisterExtension - protected static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); - - @Parameter protected FileFormat fileFormat; - - @Parameters(name = "format={0}") - public static Collection fileFormat() { - return Arrays.asList(FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC); - } - - protected TableLoader tableLoader() { - return CATALOG_EXTENSION.tableLoader(); - } - - protected abstract List runWithProjection(String... 
projected) throws Exception; - - protected abstract List runWithFilter( - Expression filter, String sqlFilter, boolean caseSensitive) throws Exception; - - protected List runWithFilter(Expression filter, String sqlFilter) throws Exception { - return runWithFilter(filter, sqlFilter, true); - } - - protected abstract List runWithOptions(Map options) throws Exception; - - protected abstract List run() throws Exception; - - @TestTemplate - public void testUnpartitionedTable() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(expectedRecords); - TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testPartitionedTable() throws Exception { - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - expectedRecords.get(0).set(2, "2020-03-20"); - new GenericAppenderHelper(table, fileFormat, temporaryDirectory) - .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testProjection() throws Exception { - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - List inputRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - new GenericAppenderHelper(table, fileFormat, temporaryDirectory) - .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords); - assertRows(runWithProjection("data"), Row.of(inputRecords.get(0).get(0))); - } - - @TestTemplate - public void testIdentityPartitionProjections() throws Exception { - Schema logSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get())); - PartitionSpec spec = - PartitionSpec.builderFor(logSchema).identity("dt").identity("level").build(); - - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, logSchema, spec); - List inputRecords = RandomGenericData.generate(logSchema, 10, 0L); - - int idx = 0; - AppendFiles append = table.newAppend(); - for (Record record : inputRecords) { - record.set(1, "2020-03-2" + idx); - record.set(2, Integer.toString(idx)); - append.appendFile( - new GenericAppenderHelper(table, fileFormat, temporaryDirectory) - .writeFile( - org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), - ImmutableList.of(record))); - idx += 1; - } - append.commit(); - - // individual fields - validateIdentityPartitionProjections(table, Collections.singletonList("dt"), inputRecords); - validateIdentityPartitionProjections(table, Collections.singletonList("level"), inputRecords); - validateIdentityPartitionProjections(table, Collections.singletonList("message"), inputRecords); - validateIdentityPartitionProjections(table, Collections.singletonList("id"), inputRecords); - // field pairs - validateIdentityPartitionProjections(table, Arrays.asList("dt", "message"), 
inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "message"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("dt", "level"), inputRecords); - // out-of-order pairs - validateIdentityPartitionProjections(table, Arrays.asList("message", "dt"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("message", "level"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "dt"), inputRecords); - // out-of-order triplets - validateIdentityPartitionProjections( - table, Arrays.asList("dt", "level", "message"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("level", "dt", "message"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("dt", "message", "level"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("level", "message", "dt"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("message", "dt", "level"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("message", "level", "dt"), inputRecords); - } - - private void validateIdentityPartitionProjections( - Table table, List projectedFields, List inputRecords) throws Exception { - List rows = runWithProjection(projectedFields.toArray(new String[0])); - - for (int pos = 0; pos < inputRecords.size(); pos++) { - Record inputRecord = inputRecords.get(pos); - Row actualRecord = rows.get(pos); - - for (int i = 0; i < projectedFields.size(); i++) { - String name = projectedFields.get(i); - assertThat(inputRecord.getField(name)) - .as("Projected field " + name + " should match") - .isEqualTo(actualRecord.getField(i)); - } - } - } - - @TestTemplate - public void testSnapshotReads() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(expectedRecords); - long snapshotId = table.currentSnapshot().snapshotId(); - - long timestampMillis = table.currentSnapshot().timestampMillis(); - - // produce another timestamp - waitUntilAfter(timestampMillis); - helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L)); - - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("snapshot-id", Long.toString(snapshotId))), - expectedRecords, - TestFixtures.SCHEMA); - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("as-of-timestamp", Long.toString(timestampMillis))), - expectedRecords, - TestFixtures.SCHEMA); - } - - @TestTemplate - public void testTagReads() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - - List expectedRecords1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(expectedRecords1); - long snapshotId = table.currentSnapshot().snapshotId(); - - table.manageSnapshots().createTag("t1", snapshotId).commit(); - - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("tag", "t1")), expectedRecords1, TestFixtures.SCHEMA); - - List expectedRecords2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(expectedRecords2); - snapshotId = 
table.currentSnapshot().snapshotId(); - - table.manageSnapshots().replaceTag("t1", snapshotId).commit(); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(expectedRecords1); - expectedRecords.addAll(expectedRecords2); - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("tag", "t1")), expectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testBranchReads() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - - List expectedRecordsBase = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(expectedRecordsBase); - long snapshotId = table.currentSnapshot().snapshotId(); - - String branchName = "b1"; - table.manageSnapshots().createBranch(branchName, snapshotId).commit(); - - List expectedRecordsForBranch = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(branchName, expectedRecordsForBranch); - - List expectedRecordsForMain = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(expectedRecordsForMain); - - List branchExpectedRecords = Lists.newArrayList(); - branchExpectedRecords.addAll(expectedRecordsBase); - branchExpectedRecords.addAll(expectedRecordsForBranch); - - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("branch", branchName)), - branchExpectedRecords, - TestFixtures.SCHEMA); - - List mainExpectedRecords = Lists.newArrayList(); - mainExpectedRecords.addAll(expectedRecordsBase); - mainExpectedRecords.addAll(expectedRecordsForMain); - - TestHelpers.assertRecords(run(), mainExpectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testIncrementalReadViaTag() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - - List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(records1); - long snapshotId1 = table.currentSnapshot().snapshotId(); - String startTag = "t1"; - table.manageSnapshots().createTag(startTag, snapshotId1).commit(); - - List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1L); - helper.appendToTable(records2); - - List records3 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 2L); - helper.appendToTable(records3); - long snapshotId3 = table.currentSnapshot().snapshotId(); - String endTag = "t2"; - table.manageSnapshots().createTag(endTag, snapshotId3).commit(); - - helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 3L)); - - List expected = Lists.newArrayList(); - expected.addAll(records2); - expected.addAll(records3); - - TestHelpers.assertRecords( - runWithOptions( - ImmutableMap.builder() - .put("start-tag", startTag) - .put("end-tag", endTag) - .buildOrThrow()), - expected, - TestFixtures.SCHEMA); - - TestHelpers.assertRecords( - runWithOptions( - ImmutableMap.builder() - .put("start-snapshot-id", Long.toString(snapshotId1)) - .put("end-tag", endTag) - .buildOrThrow()), - expected, - TestFixtures.SCHEMA); - - TestHelpers.assertRecords( - runWithOptions( - ImmutableMap.builder() - .put("start-tag", startTag) - .put("end-snapshot-id", Long.toString(snapshotId3)) - .buildOrThrow()), - expected, - TestFixtures.SCHEMA); - - assertThatThrownBy( - () -> - runWithOptions( - 
ImmutableMap.builder() - .put("start-tag", startTag) - .put("end-tag", endTag) - .put("start-snapshot-id", Long.toString(snapshotId1)) - .buildOrThrow())) - .isInstanceOf(Exception.class) - .hasMessage("START_SNAPSHOT_ID and START_TAG cannot both be set."); - - assertThatThrownBy( - () -> - runWithOptions( - ImmutableMap.builder() - .put("start-tag", startTag) - .put("end-tag", endTag) - .put("end-snapshot-id", Long.toString(snapshotId3)) - .buildOrThrow())) - .isInstanceOf(Exception.class) - .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set."); - } - - @TestTemplate - public void testIncrementalRead() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - - List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(records1); - long snapshotId1 = table.currentSnapshot().snapshotId(); - - // snapshot 2 - List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1L); - helper.appendToTable(records2); - - List records3 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 2L); - helper.appendToTable(records3); - long snapshotId3 = table.currentSnapshot().snapshotId(); - - // snapshot 4 - helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 3L)); - - List expected2 = Lists.newArrayList(); - expected2.addAll(records2); - expected2.addAll(records3); - TestHelpers.assertRecords( - runWithOptions( - ImmutableMap.builder() - .put("start-snapshot-id", Long.toString(snapshotId1)) - .put("end-snapshot-id", Long.toString(snapshotId3)) - .buildOrThrow()), - expected2, - TestFixtures.SCHEMA); - } - - @TestTemplate - public void testFilterExpPartition() throws Exception { - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - expectedRecords.get(0).set(2, "2020-03-20"); - expectedRecords.get(1).set(2, "2020-03-20"); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - DataFile dataFile1 = - helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - DataFile dataFile2 = - helper.writeFile( - org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); - helper.appendToTable(dataFile1, dataFile2); - TestHelpers.assertRecords( - runWithFilter(Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'", true), - expectedRecords, - TestFixtures.SCHEMA); - } - - private void testFilterExp(Expression filter, String sqlFilter, boolean caseSensitive) - throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 3, 0L); - expectedRecords.get(0).set(0, "a"); - expectedRecords.get(1).set(0, "b"); - expectedRecords.get(2).set(0, "c"); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - DataFile dataFile = helper.writeFile(expectedRecords); - helper.appendToTable(dataFile); - - List actual = - runWithFilter(Expressions.greaterThanOrEqual("data", "b"), "where data>='b'", true); - - TestHelpers.assertRecords(actual, expectedRecords.subList(1, 3), TestFixtures.SCHEMA); - } - - 
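// The testFilterExp helper above runs the same predicate both as an Iceberg Expression and
// as a Flink SQL WHERE clause. A minimal sketch of the Expression side, assuming only
// iceberg-api on the classpath; the class name and the composed predicate are illustrative
// and not taken from the test.
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;

class FilterExpressionSketch {
  public static void main(String[] args) {
    // The programmatic twin of "where data>='b'": matches rows whose data column is >= "b".
    Expression byApi = Expressions.greaterThanOrEqual("data", "b");
    // Expressions compose, e.g. additionally requiring data to be non-null.
    Expression composed = Expressions.and(byApi, Expressions.notNull("data"));
    System.out.println(byApi);
    System.out.println(composed);
  }
}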
@TestTemplate - public void testFilterExp() throws Exception { - testFilterExp(Expressions.greaterThanOrEqual("data", "b"), "where data>='b'", true); - } - - @TestTemplate - public void testFilterExpCaseInsensitive() throws Exception { - // sqlFilter does not support case-insensitive filtering: - // https://issues.apache.org/jira/browse/FLINK-16175 - testFilterExp(Expressions.greaterThanOrEqual("DATA", "b"), "where data>='b'", false); - } - - @TestTemplate - public void testPartitionTypes() throws Exception { - Schema typesSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), - Types.NestedField.optional(3, "str", Types.StringType.get()), - Types.NestedField.optional(4, "binary", Types.BinaryType.get()), - Types.NestedField.optional(5, "date", Types.DateType.get()), - Types.NestedField.optional(6, "time", Types.TimeType.get()), - Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone())); - PartitionSpec spec = - PartitionSpec.builderFor(typesSchema) - .identity("decimal") - .identity("str") - .identity("binary") - .identity("date") - .identity("time") - .identity("timestamp") - .build(); - - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, typesSchema, spec); - List records = RandomGenericData.generate(typesSchema, 10, 0L); - GenericAppenderHelper appender = - new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - for (Record record : records) { - org.apache.iceberg.TestHelpers.Row partition = - org.apache.iceberg.TestHelpers.Row.of( - record.get(1), - record.get(2), - record.get(3), - record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), - record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), - record.get(6) == null - ? null - : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6))); - appender.appendToTable(partition, Collections.singletonList(record)); - } - - TestHelpers.assertRecords(run(), records, typesSchema); - } - - @TestTemplate - public void testCustomizedFlinkDataTypes() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required( - 1, - "map", - Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StringType.get())), - Types.NestedField.required( - 4, "arr", Types.ListType.ofRequired(5, Types.StringType.get()))); - Table table = CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, schema); - List records = RandomGenericData.generate(schema, 10, 0L); - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - helper.appendToTable(records); - TestHelpers.assertRecords(run(), records, schema); - } - - private static void assertRows(List results, Row... 
expected) { - TestHelpers.assertRows(results, Arrays.asList(expected)); - } - - private static void waitUntilAfter(long timestampMillis) { - long current = System.currentTimeMillis(); - while (current <= timestampMillis) { - current = System.currentTimeMillis(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java deleted file mode 100644 index 1493c0932044..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.flink.types.Row; -import org.junit.jupiter.api.BeforeEach; - -/** Test Flink SELECT SQLs. */ -public class TestFlinkScanSql extends TestFlinkSource { - private volatile TableEnvironment tEnv; - - @BeforeEach - public void before() throws IOException { - SqlHelpers.sql( - getTableEnv(), - "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_EXTENSION.warehouse()); - SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); - getTableEnv() - .getConfig() - .getConfiguration() - .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - } - - private TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - this.tEnv = - TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); - } - } - } - return tEnv; - } - - @Override - protected List run( - FlinkSource.Builder formatBuilder, - Map sqlOptions, - String sqlFilter, - String... sqlSelectedFields) { - String select = String.join(",", sqlSelectedFields); - String optionStr = SqlHelpers.sqlOptionsToString(sqlOptions); - return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java deleted file mode 100644 index dd50170f0fd7..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import org.apache.flink.table.api.TableColumn; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.types.Row; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -public abstract class TestFlinkSource extends TestFlinkScan { - - @Override - protected List runWithProjection(String... projected) throws Exception { - TableSchema.Builder builder = TableSchema.builder(); - TableSchema schema = - FlinkSchemaUtil.toSchema( - FlinkSchemaUtil.convert( - CATALOG_EXTENSION.catalog().loadTable(TestFixtures.TABLE_IDENTIFIER).schema())); - for (String field : projected) { - TableColumn column = schema.getTableColumn(field).get(); - builder.field(column.getName(), column.getType()); - } - return run(FlinkSource.forRowData().project(builder.build()), Maps.newHashMap(), "", projected); - } - - @Override - protected List runWithFilter(Expression filter, String sqlFilter, boolean caseSensitive) - throws Exception { - FlinkSource.Builder builder = - FlinkSource.forRowData().filters(Collections.singletonList(filter)); - Map options = Maps.newHashMap(); - options.put("case-sensitive", Boolean.toString(caseSensitive)); - return run(builder, options, sqlFilter, "*"); - } - - @Override - protected List runWithOptions(Map options) throws Exception { - FlinkSource.Builder builder = FlinkSource.forRowData(); - Optional.ofNullable(options.get("case-sensitive")) - .ifPresent(value -> builder.caseSensitive(Boolean.parseBoolean(value))); - Optional.ofNullable(options.get("snapshot-id")) - .ifPresent(value -> builder.snapshotId(Long.parseLong(value))); - Optional.ofNullable(options.get("tag")).ifPresent(value -> builder.tag(value)); - Optional.ofNullable(options.get("branch")).ifPresent(value -> builder.branch(value)); - Optional.ofNullable(options.get("start-tag")).ifPresent(value -> builder.startTag(value)); - Optional.ofNullable(options.get("end-tag")).ifPresent(value -> builder.endTag(value)); - Optional.ofNullable(options.get("start-snapshot-id")) - .ifPresent(value -> builder.startSnapshotId(Long.parseLong(value))); - Optional.ofNullable(options.get("end-snapshot-id")) - .ifPresent(value -> builder.endSnapshotId(Long.parseLong(value))); - Optional.ofNullable(options.get("as-of-timestamp")) - .ifPresent(value -> builder.asOfTimestamp(Long.parseLong(value))); - return run(builder, options, "", "*"); - } - - @Override - protected List run() throws Exception { - return run(FlinkSource.forRowData(), Maps.newHashMap(), "", "*"); - } - - protected abstract List run( - FlinkSource.Builder 
formatBuilder, - Map sqlOptions, - String sqlFilter, - String... sqlSelectedFields) - throws Exception; -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java deleted file mode 100644 index 14131d9e96d5..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.List; -import org.apache.flink.types.Row; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkSourceConfig extends TableSourceTestBase { - private static final String TABLE = "test_table"; - - @TestTemplate - public void testFlinkSessionConfig() { - getTableEnv().getConfig().set(FlinkReadOptions.STREAMING_OPTION, true); - assertThatThrownBy(() -> sql("SELECT * FROM %s /*+ OPTIONS('as-of-timestamp'='1')*/", TABLE)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot set as-of-timestamp option for streaming reader"); - } - - @TestTemplate - public void testFlinkHintConfig() { - List result = - sql( - "SELECT * FROM %s /*+ OPTIONS('as-of-timestamp'='%d','streaming'='false')*/", - TABLE, System.currentTimeMillis()); - assertThat(result).hasSize(3); - } - - @TestTemplate - public void testReadOptionHierarchy() { - getTableEnv().getConfig().set(FlinkReadOptions.LIMIT_OPTION, 1L); - List result = sql("SELECT * FROM %s", TABLE); - // Note that this query doesn't have the limit clause in the SQL. - // This assertions works because limit is pushed down to the reader and - // reader parallelism is 1. - assertThat(result).hasSize(1); - - result = sql("SELECT * FROM %s /*+ OPTIONS('limit'='3')*/", TABLE); - assertThat(result).hasSize(3); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java deleted file mode 100644 index 2dc5bc5c658e..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.PipelineOptions; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -/** Use the FlinkSource */ -public class TestFlinkSourceSql extends TestSqlBase { - @BeforeEach - @Override - public void before() throws IOException { - SqlHelpers.sql( - getTableEnv(), - "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_EXTENSION.warehouse()); - SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); - getTableEnv() - .getConfig() - .getConfiguration() - .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - } - - @Test - public void testInferParallelismWithGlobalSetting() throws IOException { - Configuration cfg = getTableEnv().getConfig().getConfiguration(); - cfg.set(PipelineOptions.MAX_PARALLELISM, 1); - - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, null); - - GenericAppenderHelper helper = - new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); - List expectedRecords = Lists.newArrayList(); - long maxFileLen = 0; - for (int i = 0; i < 5; i++) { - List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); - DataFile dataFile = helper.writeFile(null, records); - helper.appendToTable(dataFile); - expectedRecords.addAll(records); - maxFileLen = Math.max(dataFile.fileSizeInBytes(), maxFileLen); - } - - // Make sure to generate multiple CombinedScanTasks - SqlHelpers.sql( - getTableEnv(), - "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", - maxFileLen); - - List results = run(Maps.newHashMap(), "", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords( - results, expectedRecords, TestFixtures.SCHEMA); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java deleted file mode 100644 index 18528c789114..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java +++ /dev/null @@ -1,561 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or 
more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.List; -import org.apache.flink.table.api.SqlParserException; -import org.apache.flink.types.Row; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkTableSource extends TableSourceTestBase { - - @TestTemplate - public void testLimitPushDown() { - - assertThatThrownBy(() -> sql("SELECT * FROM %s LIMIT -1", TABLE_NAME)) - .isInstanceOf(SqlParserException.class) - .hasMessageStartingWith("SQL parse failed."); - - assertThat(sql("SELECT * FROM %s LIMIT 0", TABLE_NAME)).isEmpty(); - - String sqlLimitExceed = String.format("SELECT * FROM %s LIMIT 4", TABLE_NAME); - List resultExceed = sql(sqlLimitExceed); - assertThat(resultExceed).hasSize(3); - List expectedList = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedList, resultExceed); - - String querySql = String.format("SELECT * FROM %s LIMIT 1", TABLE_NAME); - String explain = getTableEnv().explainSql(querySql); - String expectedExplain = "limit=[1]"; - assertThat(explain).as("Explain should contain LimitPushDown").contains(expectedExplain); - List result = sql(querySql); - assertThat(result).hasSize(1); - assertThat(result).containsAnyElementsOf(expectedList); - - String sqlMixed = String.format("SELECT * FROM %s WHERE id = 1 LIMIT 2", TABLE_NAME); - List mixedResult = sql(sqlMixed); - assertThat(mixedResult).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - } - - @TestTemplate - public void testNoFilterPushDown() { - String sql = String.format("SELECT * FROM %s ", TABLE_NAME); - List result = sql(sql); - List expectedRecords = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedRecords, result); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - } - - @TestTemplate - public void testFilterPushDownEqual() { - String sqlLiteralRight = String.format("SELECT * FROM %s WHERE id = 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") == 1"; - - List result = sql(sqlLiteralRight); - assertThat(result).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownEqualNull() { - String sqlEqualNull = String.format("SELECT * FROM %s WHERE data = NULL ", 
TABLE_NAME); - - List result = sql(sqlEqualNull); - assertThat(result).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - @TestTemplate - public void testFilterPushDownEqualLiteralOnLeft() { - String sqlLiteralLeft = String.format("SELECT * FROM %s WHERE 1 = id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") == 1"; - - List resultLeft = sql(sqlLiteralLeft); - assertThat(resultLeft).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownNoEqual() { - String sqlNE = String.format("SELECT * FROM %s WHERE id <> 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") != 1"; - - List resultNE = sql(sqlNE); - assertThat(resultNE).hasSize(2); - - List expectedNE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedNE, resultNE); - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownNoEqualNull() { - String sqlNotEqualNull = String.format("SELECT * FROM %s WHERE data <> NULL ", TABLE_NAME); - - List resultNE = sql(sqlNotEqualNull); - assertThat(resultNE).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - @TestTemplate - public void testFilterPushDownAnd() { - String sqlAnd = - String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME); - - List resultAnd = sql(sqlAnd); - assertThat(resultAnd).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - assertThat(scanEventCount).isEqualTo(1); - String expected = "(ref(name=\"id\") == 1 and ref(name=\"data\") == \"iceberg\")"; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expected); - } - - @TestTemplate - public void testFilterPushDownOr() { - String sqlOr = String.format("SELECT * FROM %s WHERE id = 1 OR data = 'b' ", TABLE_NAME); - String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"data\") == \"b\")"; - - List resultOr = sql(sqlOr); - assertThat(resultOr).hasSize(2); - - List expectedOR = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedOR, resultOr); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownGreaterThan() { - String sqlGT = String.format("SELECT * FROM %s WHERE id > 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") > 1"; - - List resultGT = sql(sqlGT); - assertThat(resultGT).hasSize(2); - - List expectedGT = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedGT, resultGT); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownGreaterThanNull() { - String sqlGT = String.format("SELECT * FROM %s WHERE data > null ", TABLE_NAME); - - List resultGT = sql(sqlGT); - assertThat(resultGT).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - 
@TestTemplate - public void testFilterPushDownGreaterThanLiteralOnLeft() { - String sqlGT = String.format("SELECT * FROM %s WHERE 3 > id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") < 3"; - - List resultGT = sql(sqlGT); - assertThat(resultGT).hasSize(2); - - List expectedGT = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedGT, resultGT); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownGreaterThanEqual() { - String sqlGTE = String.format("SELECT * FROM %s WHERE id >= 2 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") >= 2"; - - List resultGTE = sql(sqlGTE); - assertThat(resultGTE).hasSize(2); - - List expectedGTE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedGTE, resultGTE); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownGreaterThanEqualNull() { - String sqlGTE = String.format("SELECT * FROM %s WHERE data >= null ", TABLE_NAME); - - List resultGT = sql(sqlGTE); - assertThat(resultGT).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - @TestTemplate - public void testFilterPushDownGreaterThanEqualLiteralOnLeft() { - String sqlGTE = String.format("SELECT * FROM %s WHERE 2 >= id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") <= 2"; - - List resultGTE = sql(sqlGTE); - assertThat(resultGTE).hasSize(2); - - List expectedGTE = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedGTE, resultGTE); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownLessThan() { - String sqlLT = String.format("SELECT * FROM %s WHERE id < 2 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") < 2"; - - List resultLT = sql(sqlLT); - assertThat(resultLT).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownLessThanNull() { - String sqlLT = String.format("SELECT * FROM %s WHERE data < null ", TABLE_NAME); - - List resultGT = sql(sqlLT); - assertThat(resultGT).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - @TestTemplate - public void testFilterPushDownLessThanLiteralOnLeft() { - String sqlLT = String.format("SELECT * FROM %s WHERE 2 < id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") > 2"; - - List resultLT = sql(sqlLT); - assertThat(resultLT).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownLessThanEqual() { - String sqlLTE = String.format("SELECT * FROM %s WHERE id <= 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") <= 1"; - - List resultLTE = 
sql(sqlLTE); - assertThat(resultLTE).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownLessThanEqualNull() { - String sqlLTE = String.format("SELECT * FROM %s WHERE data <= null ", TABLE_NAME); - - List resultGT = sql(sqlLTE); - assertThat(resultGT).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - @TestTemplate - public void testFilterPushDownLessThanEqualLiteralOnLeft() { - String sqlLTE = String.format("SELECT * FROM %s WHERE 3 <= id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") >= 3"; - - List resultLTE = sql(sqlLTE); - assertThat(resultLTE).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownIn() { - String sqlIN = String.format("SELECT * FROM %s WHERE id IN (1,2) ", TABLE_NAME); - String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"id\") == 2)"; - List resultIN = sql(sqlIN); - assertThat(resultIN).hasSize(2); - - List expectedIN = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedIN, resultIN); - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownInNull() { - String sqlInNull = - String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME); - - List result = sql(sqlInNull); - assertThat(result).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - - // In SQL, null check can only be done as IS NULL or IS NOT NULL, so it's correct to ignore it - // and push the rest down. 
- String expectedScan = "ref(name=\"data\") == \"iceberg\""; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedScan); - } - - @TestTemplate - public void testFilterPushDownNotIn() { - String sqlNotIn = String.format("SELECT * FROM %s WHERE id NOT IN (3,2) ", TABLE_NAME); - - List resultNotIn = sql(sqlNotIn); - assertThat(resultNotIn).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - assertThat(scanEventCount).isEqualTo(1); - String expectedScan = "(ref(name=\"id\") != 2 and ref(name=\"id\") != 3)"; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedScan); - } - - @TestTemplate - public void testFilterPushDownNotInNull() { - String sqlNotInNull = String.format("SELECT * FROM %s WHERE id NOT IN (1,2,NULL) ", TABLE_NAME); - List resultGT = sql(sqlNotInNull); - assertThat(resultGT).isEmpty(); - assertThat(lastScanEvent) - .as( - "As the predicate pushdown filter out all rows, Flink did not create scan plan, so it doesn't publish any ScanEvent.") - .isNull(); - } - - @TestTemplate - public void testFilterPushDownIsNotNull() { - String sqlNotNull = String.format("SELECT * FROM %s WHERE data IS NOT NULL", TABLE_NAME); - String expectedFilter = "not_null(ref(name=\"data\"))"; - - List resultNotNull = sql(sqlNotNull); - assertThat(resultNotNull).hasSize(2); - - List expected = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expected, resultNotNull); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownIsNull() { - String sqlNull = String.format("SELECT * FROM %s WHERE data IS NULL", TABLE_NAME); - String expectedFilter = "is_null(ref(name=\"data\"))"; - - List resultNull = sql(sqlNull); - assertThat(resultNull).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownNot() { - String sqlNot = String.format("SELECT * FROM %s WHERE NOT (id = 1 OR id = 2 ) ", TABLE_NAME); - - List resultNot = sql(sqlNot); - assertThat(resultNot).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); - - assertThat(scanEventCount).isEqualTo(1); - String expectedFilter = "(ref(name=\"id\") != 1 and ref(name=\"id\") != 2)"; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownBetween() { - String sqlBetween = String.format("SELECT * FROM %s WHERE id BETWEEN 1 AND 2 ", TABLE_NAME); - - List resultBetween = sql(sqlBetween); - assertThat(resultBetween).hasSize(2); - - List expectedBetween = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedBetween, resultBetween); - - assertThat(scanEventCount).isEqualTo(1); - String expected = "(ref(name=\"id\") >= 1 and ref(name=\"id\") <= 2)"; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expected); - } - - @TestTemplate - public void testFilterPushDownNotBetween() { - String sqlNotBetween = - String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME); - String 
expectedFilter = "(ref(name=\"id\") < 2 or ref(name=\"id\") > 3)"; - - List resultNotBetween = sql(sqlNotBetween); - assertThat(resultNotBetween).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownLike() { - String expectedFilter = "ref(name=\"data\") startsWith \"\"ice\"\""; - - String sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'ice%%' "; - List resultLike = sql(sqlLike); - assertThat(resultLike).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - - // %% won't match the row with null value - sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%' "; - resultLike = sql(sqlLike); - assertThat(resultLike).hasSize(2); - List expectedRecords = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedRecords, resultLike); - String expectedScan = "not_null(ref(name=\"data\"))"; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedScan); - } - - @TestTemplate - public void testFilterNotPushDownLike() { - Row expectRecord = Row.of(1, "iceberg", 10.0); - String sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i' "; - List resultLike = sql(sqlNoPushDown); - assertThat(resultLike).isEmpty(); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i%%' "; - resultLike = sql(sqlNoPushDown); - assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%ice%%g' "; - resultLike = sql(sqlNoPushDown); - assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'iceber_' "; - resultLike = sql(sqlNoPushDown); - assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'i%%g' "; - resultLike = sql(sqlNoPushDown); - assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - } - - @TestTemplate - public void testFilterPushDown2Literal() { - String sql2Literal = String.format("SELECT * FROM %s WHERE 1 > 0 ", TABLE_NAME); - List result = sql(sql2Literal); - List expectedRecords = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedRecords, result); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - } - - @TestTemplate - public void testSqlParseNaN() { - // todo add some test case to test NaN - } 
-} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java deleted file mode 100644 index e3dccd76393e..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.flink.SimpleDataUtil.SCHEMA; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.data.RowDataToRowMapper; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.TestTemplate; - -public class TestIcebergSourceBounded extends TestFlinkScan { - @TestTemplate - public void testValidation() { - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); - - assertThatThrownBy( - () -> - IcebergSource.forRowData() - .tableLoader(tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - .streaming(false) - .endTag("tag") - .endSnapshotId(1L) - .build()) - .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set.") - .isInstanceOf(IllegalArgumentException.class); - } - - @Override - protected List runWithProjection(String... 
projected) throws Exception { - // Convert Iceberg schema to Flink schema - Schema icebergTableSchema = - CATALOG_EXTENSION.catalog().loadTable(TestFixtures.TABLE_IDENTIFIER).schema(); - ResolvedSchema fullFlinkSchema = FlinkSchemaUtil.toResolvedSchema(icebergTableSchema); - - // Projection - List projectedColumns = - Arrays.stream(projected) - .map(fullFlinkSchema::getColumn) - .flatMap(Optional::stream) - .collect(Collectors.toList()); - - // Convert back to Iceberg schema - ResolvedSchema projectedFlinkSchema = ResolvedSchema.of(projectedColumns); - Schema projectedIcebergSchema = - FlinkSchemaUtil.convert(icebergTableSchema, projectedFlinkSchema); - return run(projectedIcebergSchema, Lists.newArrayList(), Maps.newHashMap(), "", projected); - } - - @Override - protected List runWithFilter(Expression filter, String sqlFilter, boolean caseSensitive) - throws Exception { - Map options = Maps.newHashMap(); - options.put("case-sensitive", Boolean.toString(caseSensitive)); - return run(null, Collections.singletonList(filter), options, sqlFilter, "*"); - } - - @Override - protected List runWithOptions(Map options) throws Exception { - return run(null, Lists.newArrayList(), options, "", "*"); - } - - @Override - protected List run() throws Exception { - return run(null, Lists.newArrayList(), Maps.newHashMap(), "", "*"); - } - - protected List run( - Schema projectedSchema, - List filters, - Map options, - String sqlFilter, - String... sqlSelectedFields) - throws Exception { - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(1); - Configuration config = new Configuration(); - config.setInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); - Table table; - try (TableLoader tableLoader = tableLoader()) { - tableLoader.open(); - table = tableLoader.loadTable(); - } - - IcebergSource.Builder sourceBuilder = - IcebergSource.forRowData() - .tableLoader(tableLoader()) - .table(table) - .assignerFactory(new SimpleSplitAssignerFactory()) - .flinkConfig(config); - if (projectedSchema != null) { - sourceBuilder.project(projectedSchema); - } - - sourceBuilder.filters(filters); - sourceBuilder.setAll(options); - - DataStream stream = - sourceBuilder - .buildStream(env) - .map( - new RowDataToRowMapper( - FlinkSchemaUtil.convert( - projectedSchema == null ? table.schema() : projectedSchema))); - - try (CloseableIterator iter = stream.executeAndCollect()) { - return Lists.newArrayList(iter); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java deleted file mode 100644 index eb4ad4e0402d..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedConverterBase.java +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.flink.source.reader.ReaderFunction; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public abstract class TestIcebergSourceBoundedConverterBase { - @TempDir protected Path temporaryFolder; - - @RegisterExtension - static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); - - @Parameters(name = "format={0}, parallelism = {1}, useConverter = {2}") - public static Object[][] parameters() { - return new Object[][] { - {FileFormat.AVRO, 2, true}, - {FileFormat.PARQUET, 2, true}, - {FileFormat.ORC, 2, true} - }; - } - - @Parameter(index = 0) - FileFormat fileFormat; - - @Parameter(index = 1) - int parallelism; - - @Parameter(index = 2) - boolean useConverter; - - @TestTemplate - public void testUnpartitionedTable() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - new GenericAppenderHelper(table, fileFormat, temporaryFolder).appendToTable(expectedRecords); - TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testPartitionedTable() throws Exception { - String dateStr = "2020-03-20"; - Table table = getPartitionedTable(); - List 
expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - for (Record expectedRecord : expectedRecords) { - expectedRecord.setField("dt", dateStr); - } - addRecordsToPartitionedTable(table, dateStr, expectedRecords); - TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testProjection() throws Exception { - Table table = getPartitionedTable(); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - addRecordsToPartitionedTable(table, "2020-03-20", expectedRecords); - // select the "data" field (fieldId == 1) - Schema projectedSchema = TypeUtil.select(TestFixtures.SCHEMA, Sets.newHashSet(1)); - List expectedRows = - Arrays.asList(Row.of(expectedRecords.get(0).get(0)), Row.of(expectedRecords.get(1).get(0))); - TestHelpers.assertRows( - run(projectedSchema, Collections.emptyList(), Collections.emptyMap()), expectedRows); - } - - static Table getPartitionedTable() { - return CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - } - - static TableLoader tableLoader() { - return CATALOG_EXTENSION.tableLoader(); - } - - private void addRecordsToPartitionedTable( - Table table, String dateStr, List expectedRecords) throws IOException { - new GenericAppenderHelper(table, fileFormat, temporaryFolder) - .appendToTable(org.apache.iceberg.TestHelpers.Row.of(dateStr, 0), expectedRecords); - } - - private List run() throws Exception { - return run(null, Collections.emptyList(), Collections.emptyMap()); - } - - private List run( - Schema projectedSchema, List filters, Map options) - throws Exception { - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(parallelism); - env.getConfig().enableObjectReuse(); - - Configuration config = new Configuration(); - config.setInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); - Table table; - try (TableLoader tableLoader = tableLoader()) { - tableLoader.open(); - table = tableLoader.loadTable(); - } - - Schema readSchema = projectedSchema != null ? 
projectedSchema : table.schema(); - IcebergSource.Builder sourceBuilder = - getSourceBuilder(projectedSchema, filters, readSchema, config, table); - - if (projectedSchema != null) { - sourceBuilder.project(projectedSchema); - } - - sourceBuilder.filters(filters); - sourceBuilder.setAll(options); - - DataStream inputStream = - env.fromSource( - sourceBuilder.build(), - WatermarkStrategy.noWatermarks(), - "testBasicRead", - getTypeInfo(readSchema)); - - DataStream stream = mapToRow(inputStream, readSchema); - - try (CloseableIterator iter = stream.executeAndCollect()) { - return Lists.newArrayList(iter); - } - } - - private IcebergSource.Builder getSourceBuilder( - Schema projectedSchema, - List filters, - Schema readSchema, - Configuration config, - Table table) - throws Exception { - if (useConverter) { - return createSourceBuilderWithConverter(readSchema, config, table); - } - return createSourceBuilderWithReaderFunction(table, projectedSchema, filters, config); - } - - private IcebergSource.Builder createSourceBuilderWithConverter( - Schema readSchema, Configuration config, Table table) throws Exception { - return IcebergSource.forOutputType(getConverter(readSchema, table)) - .tableLoader(tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - .flinkConfig(config); - } - - private IcebergSource.Builder createSourceBuilderWithReaderFunction( - Table table, Schema projected, List filters, Configuration config) - throws Exception { - return IcebergSource.builder() - .tableLoader(tableLoader()) - .readerFunction(getReaderFunction(projected, table, filters)) - .assignerFactory(new SimpleSplitAssignerFactory()) - .flinkConfig(config); - } - - protected abstract org.apache.iceberg.flink.source.reader.RowDataConverter getConverter( - Schema icebergSchema, Table table) throws Exception; - - protected ReaderFunction getReaderFunction( - Schema icebergSchema, Table table, List filters) throws Exception { - throw new UnsupportedOperationException("No default implementation for getReaderFunction"); - } - - protected abstract TypeInformation getTypeInfo(Schema icebergSchema); - - protected abstract DataStream mapToRow(DataStream inputStream, Schema icebergSchema); -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java deleted file mode 100644 index faddce542285..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.List; -import org.apache.avro.generic.GenericRecord; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.data.RowDataToRowMapper; -import org.apache.iceberg.flink.sink.AvroGenericRecordToRowDataMapper; -import org.apache.iceberg.flink.source.reader.AvroGenericRecordConverter; -import org.apache.iceberg.flink.source.reader.AvroGenericRecordReaderFunction; -import org.apache.iceberg.flink.source.reader.ReaderFunction; -import org.apache.iceberg.flink.source.reader.RowDataConverter; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergSourceBoundedGenericRecord - extends TestIcebergSourceBoundedConverterBase { - - @Parameters(name = "format={0}, parallelism = {1}, useConverter = {2}") - public static Object[][] parameters() { - return new Object[][] { - {FileFormat.AVRO, 2, true}, - {FileFormat.PARQUET, 2, true}, - {FileFormat.PARQUET, 2, false}, - {FileFormat.ORC, 2, true} - }; - } - - @Override - protected RowDataConverter getConverter(Schema icebergSchema, Table table) { - return AvroGenericRecordConverter.fromIcebergSchema(icebergSchema, table.name()); - } - - @Override - protected ReaderFunction getReaderFunction( - Schema icebergSchema, Table table, List filters) throws Exception { - return new AvroGenericRecordReaderFunction( - TestFixtures.TABLE_IDENTIFIER.name(), - new Configuration(), - table.schema(), - icebergSchema, - null, - false, - table.io(), - table.encryption(), - filters); - } - - @Override - protected TypeInformation getTypeInfo(Schema icebergSchema) { - org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, TestFixtures.TABLE_IDENTIFIER.name()); - return new GenericRecordAvroTypeInfo(avroSchema); - } - - @Override - protected DataStream mapToRow(DataStream inputStream, Schema icebergSchema) { - RowType rowType = FlinkSchemaUtil.convert(icebergSchema); - org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, TestFixtures.TABLE_IDENTIFIER.name()); - return inputStream - .map(AvroGenericRecordToRowDataMapper.forAvroSchema(avroSchema)) - .map(new RowDataToRowMapper(rowType)); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java deleted file mode 100644 index 13087bc0a06a..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedRow.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo; -import org.apache.flink.types.Row; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.source.reader.RowConverter; -import org.apache.iceberg.flink.source.reader.RowDataConverter; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergSourceBoundedRow extends TestIcebergSourceBoundedConverterBase { - - @Override - protected RowDataConverter getConverter(Schema icebergSchema, Table table) { - return RowConverter.fromIcebergSchema(icebergSchema); - } - - @Override - protected TypeInformation getTypeInfo(Schema icebergSchema) { - ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(icebergSchema); - TypeInformation[] types = - resolvedSchema.getColumnDataTypes().stream() - .map(ExternalTypeInfo::of) - .toArray(TypeInformation[]::new); - String[] fieldNames = resolvedSchema.getColumnNames().toArray(String[]::new); - return new RowTypeInfo(types, fieldNames); - } - - @Override - protected DataStream mapToRow(DataStream inputStream, Schema icebergSchema) { - return inputStream; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java deleted file mode 100644 index d3713e296014..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.flink.types.Row; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.junit.jupiter.api.BeforeEach; - -public class TestIcebergSourceBoundedSql extends TestIcebergSourceBounded { - private volatile TableEnvironment tEnv; - - @BeforeEach - public void before() throws IOException { - Configuration tableConf = getTableEnv().getConfig().getConfiguration(); - tableConf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true); - SqlHelpers.sql( - getTableEnv(), - "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_EXTENSION.warehouse()); - SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); - getTableEnv() - .getConfig() - .getConfiguration() - .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - } - - private TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - this.tEnv = - TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); - } - } - } - return tEnv; - } - - @Override - protected List run( - Schema projectedSchema, - List filters, - Map options, - String sqlFilter, - String... sqlSelectedFields) - throws Exception { - String select = String.join(",", sqlSelectedFields); - String optionStr = SqlHelpers.sqlOptionsToString(options); - return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java deleted file mode 100644 index 749cbf89338a..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java +++ /dev/null @@ -1,573 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.nio.file.Path; -import java.time.Duration; -import java.util.Collection; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import org.apache.flink.api.common.JobID; -import org.apache.flink.api.common.JobStatus; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.client.program.ClusterClient; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.runtime.client.JobStatusMessage; -import org.apache.flink.runtime.testutils.InMemoryReporter; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.test.junit5.InjectClusterClient; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.HadoopTableExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.data.RowDataToRowMapper; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -public class TestIcebergSourceContinuous { - - public static final InMemoryReporter METRIC_REPORTER = InMemoryReporter.create(); - - @TempDir protected Path temporaryFolder; - - @RegisterExtension - public static MiniClusterExtension miniClusterExtension = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(METRIC_REPORTER); - - @RegisterExtension - private static final HadoopTableExtension TABLE_EXTENSION = - new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); - - private final AtomicLong randomSeed = new AtomicLong(0L); - - @Test - public void testTableScanThenIncremental() throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List result1 = waitForResult(iter, 2); - TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, 
randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - List result2 = waitForResult(iter, 2); - TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - - assertThatIcebergEnumeratorMetricsExist(); - } - } - - @Test - public void testTableScanThenIncrementalAfterExpiration() throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - long snapshotId = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - - TABLE_EXTENSION.table().expireSnapshots().expireSnapshotId(snapshotId).commit(); - - assertThat(TABLE_EXTENSION.table().history()).hasSize(1); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - - assertThat(FlinkSplitPlanner.checkScanMode(scanContext)) - .isEqualTo(FlinkSplitPlanner.ScanMode.BATCH); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List result1 = waitForResult(iter, 4); - List initialRecords = Lists.newArrayList(); - initialRecords.addAll(batch1); - initialRecords.addAll(batch2); - TestHelpers.assertRecords(result1, initialRecords, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - - assertThatIcebergEnumeratorMetricsExist(); - } - } - - @Test - public void testEarliestSnapshot() throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot0 - List batch0 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch0); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List result1 = waitForResult(iter, 4); - List combinedBatch0AndBatch1 = Lists.newArrayList(batch0); - combinedBatch0AndBatch1.addAll(batch1); - TestHelpers.assertRecords(result1, 
combinedBatch0AndBatch1, TABLE_EXTENSION.table().schema()); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - - List result2 = waitForResult(iter, 2); - TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - - assertThatIcebergEnumeratorMetricsExist(); - } - } - - @Test - public void testLatestSnapshot(@InjectClusterClient ClusterClient clusterClient) - throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot0 - List batch0 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch0); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .build(); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - // we want to make sure job is running first so that enumerator can - // start from the latest snapshot before inserting the next batch2 below. - waitUntilJobIsRunning(clusterClient); - - // inclusive behavior for starting snapshot - List result1 = waitForResult(iter, 2); - TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - - List result2 = waitForResult(iter, 2); - TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - - assertThatIcebergEnumeratorMetricsExist(); - } - } - - @Test - public void testSpecificSnapshotId() throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot0 - List batch0 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch0); - long snapshot0 = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - long snapshot1 = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot1) - .build(); - - try 
(CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List result1 = waitForResult(iter, 2); - TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - - List result2 = waitForResult(iter, 2); - TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - - assertThatIcebergEnumeratorMetricsExist(); - } - } - - @Test - public void testSpecificSnapshotTimestamp() throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot0 - List batch0 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch0); - long snapshot0Timestamp = TABLE_EXTENSION.table().currentSnapshot().timestampMillis(); - - // sleep for 2 ms to make sure snapshot1 has a higher timestamp value - Thread.sleep(2); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - long snapshot1Timestamp = TABLE_EXTENSION.table().currentSnapshot().timestampMillis(); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot1Timestamp) - .build(); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - // consume data from snapshot1 - List result1 = waitForResult(iter, 2); - TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - - List result2 = waitForResult(iter, 2); - TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - - assertThatIcebergEnumeratorMetricsExist(); - } - } - - @Test - public void testReadingFromBranch() throws Exception { - String branch = "b1"; - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - List batchBase = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batchBase); - - // create branch - TABLE_EXTENSION - .table() - .manageSnapshots() - .createBranch(branch, TABLE_EXTENSION.table().currentSnapshot().snapshotId()) - .commit(); - - // snapshot1 to branch - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - 
dataAppender.appendToTable(branch, batch1); - - // snapshot2 to branch - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(branch, batch2); - - List branchExpectedRecords = Lists.newArrayList(); - branchExpectedRecords.addAll(batchBase); - branchExpectedRecords.addAll(batch1); - branchExpectedRecords.addAll(batch2); - // reads from branch: it should contain the first snapshot (before the branch creation) followed - // by the next 2 snapshots added - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .useBranch(branch) - .build(); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List resultMain = waitForResult(iter, 6); - TestHelpers.assertRecords( - resultMain, branchExpectedRecords, TABLE_EXTENSION.table().schema()); - - // snapshot3 to branch - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(branch, batch3); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - - // snapshot4 to branch - List batch4 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(branch, batch4); - - List result4 = waitForResult(iter, 2); - TestHelpers.assertRecords(result4, batch4, TABLE_EXTENSION.table().schema()); - } - - // read only from main branch. Should contain only the first snapshot - scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List resultMain = waitForResult(iter, 2); - TestHelpers.assertRecords(resultMain, batchBase, TABLE_EXTENSION.table().schema()); - - List batchMain2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batchMain2); - resultMain = waitForResult(iter, 2); - TestHelpers.assertRecords(resultMain, batchMain2, TABLE_EXTENSION.table().schema()); - } - } - - @Test - public void testValidation() { - assertThatThrownBy( - () -> - IcebergSource.forRowData() - .tableLoader(TABLE_EXTENSION.tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - .streaming(true) - .endTag("tag") - .build()) - .hasMessage("Cannot set end-tag option for streaming reader") - .isInstanceOf(IllegalArgumentException.class); - } - - private DataStream createStream(ScanContext scanContext) throws Exception { - // start the source and collect output - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(1); - DataStream stream = - env.fromSource( - IcebergSource.forRowData() - .tableLoader(TABLE_EXTENSION.tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - .streaming(scanContext.isStreaming()) - .streamingStartingStrategy(scanContext.streamingStartingStrategy()) - .startSnapshotTimestamp(scanContext.startSnapshotTimestamp()) - .startSnapshotId(scanContext.startSnapshotId()) - .monitorInterval(Duration.ofMillis(10L)) - .branch(scanContext.branch()) - .build(), - 
WatermarkStrategy.noWatermarks(), - "icebergSource", - TypeInformation.of(RowData.class)) - .map(new RowDataToRowMapper(FlinkSchemaUtil.convert(TABLE_EXTENSION.table().schema()))); - return stream; - } - - public static List waitForResult(CloseableIterator iter, int limit) { - List results = Lists.newArrayListWithCapacity(limit); - while (results.size() < limit) { - if (iter.hasNext()) { - results.add(iter.next()); - } else { - break; - } - } - return results; - } - - public static void waitUntilJobIsRunning(ClusterClient client) { - Awaitility.await("job should be running") - .atMost(Duration.ofSeconds(30)) - .pollInterval(Duration.ofMillis(10)) - .untilAsserted(() -> assertThat(getRunningJobs(client)).isNotEmpty()); - } - - public static List getRunningJobs(ClusterClient client) throws Exception { - Collection statusMessages = client.listJobs().get(); - return statusMessages.stream() - .filter(status -> status.getJobState() == JobStatus.RUNNING) - .map(JobStatusMessage::getJobId) - .collect(Collectors.toList()); - } - - private static void assertThatIcebergEnumeratorMetricsExist() { - assertThatIcebergSourceMetricExists( - "enumerator", "coordinator.enumerator.elapsedSecondsSinceLastSplitDiscovery"); - assertThatIcebergSourceMetricExists("enumerator", "coordinator.enumerator.unassignedSplits"); - assertThatIcebergSourceMetricExists("enumerator", "coordinator.enumerator.pendingRecords"); - } - - private static void assertThatIcebergSourceMetricExists( - String metricGroupPattern, String metricName) { - Optional groups = METRIC_REPORTER.findGroup(metricGroupPattern); - assertThat(groups).isPresent(); - assertThat( - METRIC_REPORTER.getMetricsByGroup(groups.get()).keySet().stream() - .map(name -> groups.get().getMetricIdentifier(name))) - .satisfiesOnlyOnce( - fullMetricName -> assertThat(fullMetricName).containsSubsequence(metricName)); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java deleted file mode 100644 index 0aea5b0b0a9b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java +++ /dev/null @@ -1,392 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.flink.SimpleDataUtil.tableRecords; -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.file.Path; -import java.time.Duration; -import java.util.List; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.flink.api.common.JobID; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.restartstrategy.RestartStrategies; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.client.program.ClusterClient; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.core.execution.SavepointFormatType; -import org.apache.flink.runtime.highavailability.nonha.embedded.HaLeadershipControl; -import org.apache.flink.runtime.jobgraph.SavepointConfigOptions; -import org.apache.flink.runtime.minicluster.MiniCluster; -import org.apache.flink.runtime.minicluster.RpcServiceSharing; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.test.junit5.InjectClusterClient; -import org.apache.flink.test.junit5.InjectMiniCluster; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.sink.FlinkSink; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -@Timeout(value = 120) -public class TestIcebergSourceFailover { - - // Parallelism higher than 1, but lower than the number of splits used by some of our tests - // The goal is to allow some splits to remain in the enumerator when restoring the state - private static final int PARALLELISM = 2; - private static final int DO_NOT_FAIL = Integer.MAX_VALUE; - protected static final MiniClusterResourceConfiguration MINI_CLUSTER_RESOURCE_CONFIG = - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(1) - .setNumberSlotsPerTaskManager(PARALLELISM) - .setRpcServiceSharing(RpcServiceSharing.DEDICATED) - .withHaLeadershipControl() - .build(); - - @RegisterExtension - public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - new MiniClusterExtension(MINI_CLUSTER_RESOURCE_CONFIG); - - @TempDir protected Path temporaryFolder; - - @RegisterExtension - protected static final HadoopCatalogExtension 
SOURCE_CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); - - @RegisterExtension - protected static final HadoopCatalogExtension SINK_CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.SINK_TABLE); - - protected Table sourceTable; - protected Table sinkTable; - - @BeforeEach - protected void setupTable() { - this.sourceTable = - SOURCE_CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - this.sinkTable = - SINK_CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.SINK_TABLE_IDENTIFIER, TestFixtures.SCHEMA); - } - - @BeforeEach - protected void startMiniCluster(@InjectMiniCluster MiniCluster miniCluster) throws Exception { - if (!miniCluster.isRunning()) { - miniCluster.start(); - } - } - - @AfterEach - protected void stopMiniCluster(@InjectMiniCluster MiniCluster miniCluster) throws Exception { - miniCluster.close(); - } - - protected IcebergSource.Builder sourceBuilder() { - Configuration config = new Configuration(); - return IcebergSource.forRowData() - .tableLoader(SOURCE_CATALOG_EXTENSION.tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - // Prevent combining splits - .set( - FlinkReadOptions.SPLIT_FILE_OPEN_COST, - Long.toString(TableProperties.SPLIT_SIZE_DEFAULT)) - .flinkConfig(config); - } - - protected Schema schema() { - return TestFixtures.SCHEMA; - } - - protected List generateRecords(int numRecords, long seed) { - return RandomGenericData.generate(schema(), numRecords, seed); - } - - protected void assertRecords(Table table, List expectedRecords, Duration timeout) - throws Exception { - SimpleDataUtil.assertTableRecords(table, expectedRecords, timeout); - } - - @Disabled("Disabled for now as it is flaky on CI") - @Test - public void testBoundedWithSavepoint(@InjectClusterClient ClusterClient clusterClient) - throws Exception { - List expectedRecords = Lists.newArrayList(); - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); - for (int i = 0; i < 4; ++i) { - List records = generateRecords(2, i); - expectedRecords.addAll(records); - dataAppender.appendToTable(records); - } - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - createBoundedStreams(env, 2); - - JobClient jobClient = env.executeAsync("Bounded Iceberg Source Savepoint Test"); - JobID jobId = jobClient.getJobID(); - - // Write something, but do not finish before checkpoint is created - RecordCounterToWait.waitForCondition(); - CompletableFuture savepoint = - clusterClient.stopWithSavepoint( - jobId, false, temporaryFolder.toString(), SavepointFormatType.CANONICAL); - RecordCounterToWait.continueProcessing(); - - // Wait for the job to stop with the savepoint - String savepointPath = savepoint.get(); - - // We expect that at least a few records has written - assertThat(tableRecords(sinkTable)).hasSizeGreaterThan(0); - - // New env from the savepoint - Configuration conf = new Configuration(); - conf.set(SavepointConfigOptions.SAVEPOINT_PATH, savepointPath); - env = StreamExecutionEnvironment.getExecutionEnvironment(conf); - createBoundedStreams(env, DO_NOT_FAIL); - - env.execute("Bounded Iceberg Source Savepoint Test"); - - // We expect no duplications - assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); - } - - @Test - public void testBoundedWithTaskManagerFailover(@InjectMiniCluster MiniCluster miniCluster) - throws Exception { - 
testBoundedIcebergSource(FailoverType.TM, miniCluster); - } - - @Test - public void testBoundedWithJobManagerFailover(@InjectMiniCluster MiniCluster miniCluster) - throws Exception { - testBoundedIcebergSource(FailoverType.JM, miniCluster); - } - - private void testBoundedIcebergSource(FailoverType failoverType, MiniCluster miniCluster) - throws Exception { - List expectedRecords = Lists.newArrayList(); - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); - for (int i = 0; i < 4; ++i) { - List records = generateRecords(2, i); - expectedRecords.addAll(records); - dataAppender.appendToTable(records); - } - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0)); - createBoundedStreams(env, 2); - - JobClient jobClient = env.executeAsync("Bounded Iceberg Source Failover Test"); - JobID jobId = jobClient.getJobID(); - - RecordCounterToWait.waitForCondition(); - triggerFailover(failoverType, jobId, RecordCounterToWait::continueProcessing, miniCluster); - - assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); - } - - @Test - public void testContinuousWithTaskManagerFailover(@InjectMiniCluster MiniCluster miniCluster) - throws Exception { - testContinuousIcebergSource(FailoverType.TM, miniCluster); - } - - @Test - public void testContinuousWithJobManagerFailover(@InjectMiniCluster MiniCluster miniCluster) - throws Exception { - testContinuousIcebergSource(FailoverType.JM, miniCluster); - } - - private void testContinuousIcebergSource(FailoverType failoverType, MiniCluster miniCluster) - throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); - List expectedRecords = Lists.newArrayList(); - - List batch = generateRecords(2, 0); - expectedRecords.addAll(batch); - dataAppender.appendToTable(batch); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(PARALLELISM); - env.enableCheckpointing(10L); - Configuration config = new Configuration(); - config.setInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); - - DataStream stream = - env.fromSource( - sourceBuilder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10)) - .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(), - WatermarkStrategy.noWatermarks(), - "IcebergSource", - TypeInformation.of(RowData.class)); - - // CollectStreamSink from DataStream#executeAndCollect() doesn't guarantee - // exactly-once behavior. When Iceberg sink, we can verify end-to-end - // exactly-once. Here we mainly about source exactly-once behavior. - FlinkSink.forRowData(stream) - .table(sinkTable) - .tableLoader(SINK_CATALOG_EXTENSION.tableLoader()) - .append(); - - JobClient jobClient = env.executeAsync("Continuous Iceberg Source Failover Test"); - JobID jobId = jobClient.getJobID(); - - for (int i = 1; i < 5; i++) { - Thread.sleep(10); - List records = generateRecords(2, i); - expectedRecords.addAll(records); - dataAppender.appendToTable(records); - if (i == 2) { - triggerFailover(failoverType, jobId, () -> {}, miniCluster); - } - } - - // wait longer for continuous source to reduce flakiness - // because CI servers tend to be overloaded. 
- assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); - } - - private void createBoundedStreams(StreamExecutionEnvironment env, int failAfter) { - env.setParallelism(PARALLELISM); - - DataStream stream = - env.fromSource( - sourceBuilder().build(), - WatermarkStrategy.noWatermarks(), - "IcebergSource", - TypeInformation.of(RowData.class)); - - DataStream streamFailingInTheMiddleOfReading = - RecordCounterToWait.wrapWithFailureAfter(stream, failAfter); - - // CollectStreamSink from DataStream#executeAndCollect() doesn't guarantee - // exactly-once behavior. When Iceberg sink, we can verify end-to-end - // exactly-once. Here we mainly about source exactly-once behavior. - FlinkSink.forRowData(streamFailingInTheMiddleOfReading) - .table(sinkTable) - .tableLoader(SINK_CATALOG_EXTENSION.tableLoader()) - .append(); - } - - // ------------------------------------------------------------------------ - // test utilities copied from Flink's FileSourceTextLinesITCase - // ------------------------------------------------------------------------ - - private enum FailoverType { - NONE, - TM, - JM - } - - private static void triggerFailover( - FailoverType type, JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) - throws Exception { - switch (type) { - case NONE: - afterFailAction.run(); - break; - case TM: - restartTaskManager(afterFailAction, miniCluster); - break; - case JM: - triggerJobManagerFailover(jobId, afterFailAction, miniCluster); - break; - } - } - - private static void triggerJobManagerFailover( - JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) throws Exception { - HaLeadershipControl haLeadershipControl = miniCluster.getHaLeadershipControl().get(); - haLeadershipControl.revokeJobMasterLeadership(jobId).get(); - afterFailAction.run(); - haLeadershipControl.grantJobMasterLeadership(jobId).get(); - } - - private static void restartTaskManager(Runnable afterFailAction, MiniCluster miniCluster) - throws Exception { - miniCluster.terminateTaskManager(0).get(); - afterFailAction.run(); - miniCluster.startTaskManager(); - } - - private static class RecordCounterToWait { - - private static AtomicInteger records; - private static CountDownLatch countDownLatch; - private static CompletableFuture continueProcessing; - - private static DataStream wrapWithFailureAfter(DataStream stream, int condition) { - - records = new AtomicInteger(); - continueProcessing = new CompletableFuture<>(); - countDownLatch = new CountDownLatch(stream.getParallelism()); - return stream.map( - record -> { - boolean reachedFailPoint = records.incrementAndGet() > condition; - boolean notFailedYet = countDownLatch.getCount() != 0; - if (notFailedYet && reachedFailPoint) { - countDownLatch.countDown(); - continueProcessing.get(); - } - return record; - }); - } - - private static void waitForCondition() throws InterruptedException { - countDownLatch.await(); - } - - private static void continueProcessing() { - continueProcessing.complete(null); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java deleted file mode 100644 index 4f61d2f7308a..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more 
contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.time.Duration; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneId; -import java.time.ZoneOffset; -import java.util.List; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.types.Comparators; -import org.apache.iceberg.util.StructLikeWrapper; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.BeforeEach; - -public class TestIcebergSourceFailoverWithWatermarkExtractor extends TestIcebergSourceFailover { - // Increment ts by 15 minutes for each generateRecords batch - private static final long RECORD_BATCH_TS_INCREMENT_MILLI = TimeUnit.MINUTES.toMillis(15); - // Within a batch, increment ts by 1 second - private static final long RECORD_TS_INCREMENT_MILLI = TimeUnit.SECONDS.toMillis(1); - - private final AtomicLong tsMilli = new AtomicLong(System.currentTimeMillis()); - - @Override - @BeforeEach - protected void setupTable() { - this.sourceTable = - SOURCE_CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.TS_SCHEMA); - this.sinkTable = - SINK_CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.SINK_TABLE_IDENTIFIER, TestFixtures.TS_SCHEMA); - } - - @Override - protected IcebergSource.Builder sourceBuilder() { - Configuration config = new Configuration(); - return IcebergSource.forRowData() - .tableLoader(SOURCE_CATALOG_EXTENSION.tableLoader()) - .watermarkColumn("ts") - .project(TestFixtures.TS_SCHEMA) - // Prevent combining splits - .set( - FlinkReadOptions.SPLIT_FILE_OPEN_COST, - Long.toString(TableProperties.SPLIT_SIZE_DEFAULT)) - .flinkConfig(config); - } - - @Override - protected Schema schema() { - return TestFixtures.TS_SCHEMA; - } - - @Override - protected List generateRecords(int numRecords, long seed) { - // Override the ts field to create a more realistic situation for event time alignment - tsMilli.addAndGet(RECORD_BATCH_TS_INCREMENT_MILLI); - return RandomGenericData.generate(schema(), numRecords, seed).stream() - .peek( - record -> { - LocalDateTime ts = - LocalDateTime.ofInstant( - Instant.ofEpochMilli(tsMilli.addAndGet(RECORD_TS_INCREMENT_MILLI)), - ZoneId.of("Z")); - record.setField("ts", ts); - }) - .collect(Collectors.toList()); - } - - /** - * This override 
is needed because {@link Comparators} used by {@link StructLikeWrapper} retrieves - * Timestamp type using Long type as inner class, while the {@link RandomGenericData} generates - * {@link LocalDateTime} for {@code TimestampType.withoutZone()}. This method normalizes the - * {@link LocalDateTime} to a Long type so that Comparators can continue to work. - */ - @Override - protected void assertRecords(Table table, List expectedRecords, Duration timeout) - throws Exception { - List expectedNormalized = convertLocalDateTimeToMilli(expectedRecords); - Awaitility.await("expected list of records should be produced") - .atMost(timeout) - .untilAsserted( - () -> - SimpleDataUtil.assertRecordsEqual( - expectedNormalized, - convertLocalDateTimeToMilli(SimpleDataUtil.tableRecords(table)), - table.schema())); - } - - private List convertLocalDateTimeToMilli(List records) { - return records.stream() - .peek( - r -> { - LocalDateTime localDateTime = ((LocalDateTime) r.getField("ts")); - r.setField("ts", localDateTime.atZone(ZoneOffset.UTC).toInstant().toEpochMilli()); - }) - .collect(Collectors.toList()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java deleted file mode 100644 index 2908cb927269..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceInferParallelism.java +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.lang.reflect.Field; -import java.nio.file.Path; -import java.util.List; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.runtime.executiongraph.AccessExecutionGraph; -import org.apache.flink.runtime.executiongraph.AccessExecutionJobVertex; -import org.apache.flink.runtime.minicluster.MiniCluster; -import org.apache.flink.runtime.testutils.InternalMiniClusterExtension; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.data.RowDataToRowMapper; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -public class TestIcebergSourceInferParallelism { - private static final int NUM_TMS = 2; - private static final int SLOTS_PER_TM = 2; - private static final int PARALLELISM = NUM_TMS * SLOTS_PER_TM; - private static final int MAX_INFERRED_PARALLELISM = 3; - - @RegisterExtension - private static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(NUM_TMS) - .setNumberSlotsPerTaskManager(SLOTS_PER_TM) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - - @RegisterExtension - protected static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); - - @TempDir private Path tmpDir; - - private Table table; - private GenericAppenderHelper dataAppender; - - @BeforeEach - public void before() throws IOException { - this.table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - this.dataAppender = new GenericAppenderHelper(table, FileFormat.PARQUET, tmpDir); - } - - @AfterEach - public void after() { - CATALOG_EXTENSION.catalog().dropTable(TestFixtures.TABLE_IDENTIFIER); - } - - @Test - public void testEmptyTable() throws Exception { - // Inferred parallelism should be at least 1 even if table is empty - test(1, 0); - } - - @Test - public void testTableWithFilesLessThanMaxInferredParallelism() throws Exception { - // Append files to the table - for (int i = 0; i < 2; ++i) { - List batch = RandomGenericData.generate(table.schema(), 1, 0); - dataAppender.appendToTable(batch); - } - - // Inferred parallelism should equal to 2 splits - test(2, 2); - } - - @Test - 
public void testTableWithFilesMoreThanMaxInferredParallelism() throws Exception { - // Append files to the table - for (int i = 0; i < MAX_INFERRED_PARALLELISM + 1; ++i) { - List batch = RandomGenericData.generate(table.schema(), 1, 0); - dataAppender.appendToTable(batch); - } - - // Inferred parallelism should be capped by the MAX_INFERRED_PARALLELISM - test(MAX_INFERRED_PARALLELISM, MAX_INFERRED_PARALLELISM + 1); - } - - private void test(int expectedParallelism, int expectedRecords) throws Exception { - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(PARALLELISM); - - Configuration config = new Configuration(); - config.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true); - config.set( - FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, - MAX_INFERRED_PARALLELISM); - - DataStream dataStream = - IcebergSource.forRowData() - .tableLoader(CATALOG_EXTENSION.tableLoader()) - .table(table) - .flinkConfig(config) - // force one file per split - .splitSize(1L) - .buildStream(env) - .map(new RowDataToRowMapper(FlinkSchemaUtil.convert(table.schema()))); - - DataStream.Collector collector = new DataStream.Collector<>(); - dataStream.collectAsync(collector); - JobClient jobClient = env.executeAsync(); - try (CloseableIterator iterator = collector.getOutput()) { - List result = Lists.newArrayList(); - while (iterator.hasNext()) { - result.add(iterator.next()); - } - - assertThat(result).hasSize(expectedRecords); - verifySourceParallelism( - expectedParallelism, miniCluster().getExecutionGraph(jobClient.getJobID()).get()); - } - } - - /** - * Borrowed this approach from Flink {@code FileSourceTextLinesITCase} to get source parallelism - * from execution graph. - */ - private static void verifySourceParallelism( - int expectedParallelism, AccessExecutionGraph executionGraph) { - AccessExecutionJobVertex sourceVertex = - executionGraph.getVerticesTopologically().iterator().next(); - assertThat(sourceVertex.getParallelism()).isEqualTo(expectedParallelism); - } - - /** - * Use reflection to get {@code InternalMiniClusterExtension} and {@code MiniCluster} to get - * execution graph and source parallelism. Haven't find other way via public APIS. - */ - private static MiniCluster miniCluster() throws Exception { - Field privateField = - MiniClusterExtension.class.getDeclaredField("internalMiniClusterExtension"); - privateField.setAccessible(true); - InternalMiniClusterExtension internalExtension = - (InternalMiniClusterExtension) privateField.get(MINI_CLUSTER_EXTENSION); - return internalExtension.getMiniCluster(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java deleted file mode 100644 index df148c212ebd..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.util.CloseableIterator; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.StructLikeSet; -import org.junit.jupiter.api.extension.RegisterExtension; - -public class TestIcebergSourceReaderDeletes extends TestFlinkReaderDeletesBase { - - private static final int PARALLELISM = 4; - - @RegisterExtension - private static final MiniClusterExtension MINI_CLUSTER = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @Override - protected StructLikeSet rowSet(String tableName, Table testTable, String... 
columns) - throws IOException { - Schema projected = testTable.schema().select(columns); - RowType rowType = FlinkSchemaUtil.convert(projected); - - Map properties = Maps.newHashMap(); - properties.put( - CatalogProperties.WAREHOUSE_LOCATION, - hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); - properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); - properties.put( - CatalogProperties.CLIENT_POOL_SIZE, - Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); - CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); - TableLoader hiveTableLoader = - TableLoader.fromCatalog(hiveCatalogLoader, TableIdentifier.of("default", tableName)); - hiveTableLoader.open(); - try (TableLoader tableLoader = hiveTableLoader) { - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(1); - DataStream stream = - env.fromSource( - IcebergSource.builder() - .tableLoader(tableLoader) - .assignerFactory(new SimpleSplitAssignerFactory()) - .project(projected) - .build(), - WatermarkStrategy.noWatermarks(), - "testBasicRead", - TypeInformation.of(RowData.class)); - - try (CloseableIterator iter = stream.executeAndCollect()) { - List rowDataList = Lists.newArrayList(iter); - StructLikeSet set = StructLikeSet.create(projected.asStruct()); - rowDataList.forEach( - rowData -> { - RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); - set.add(wrapper.wrap(rowData)); - }); - return set; - } catch (Exception e) { - throw new IOException("Failed to collect result", e); - } - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java deleted file mode 100644 index 0cdaf8371cbd..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.time.Instant; -import java.time.ZoneId; -import java.util.List; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -/** Use the IcebergSource (FLIP-27) */ -public class TestIcebergSourceSql extends TestSqlBase { - private static final Schema SCHEMA_TS = - new Schema( - required(1, "t1", Types.TimestampType.withoutZone()), - required(2, "t2", Types.LongType.get())); - - @BeforeEach - @Override - public void before() throws IOException { - setUpTableEnv(getTableEnv()); - setUpTableEnv(getStreamingTableEnv()); - } - - private static void setUpTableEnv(TableEnvironment tableEnvironment) { - Configuration tableConf = tableEnvironment.getConfig().getConfiguration(); - tableConf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true); - // Disable inferring parallelism to avoid interfering watermark tests - // that check split assignment is ordered by the watermark column. - // The tests assumes default parallelism of 1 with single reader task - // in order to check the order of read records. 
- tableConf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - - tableEnvironment.getConfig().set("table.exec.resource.default-parallelism", "1"); - SqlHelpers.sql( - tableEnvironment, - "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_EXTENSION.warehouse()); - SqlHelpers.sql(tableEnvironment, "use catalog iceberg_catalog"); - - tableConf.set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - } - - @AfterEach - public void after() throws IOException { - CATALOG_EXTENSION.catalog().dropTable(TestFixtures.TABLE_IDENTIFIER); - } - - private Record generateRecord(Instant t1, long t2) { - Record record = GenericRecord.create(SCHEMA_TS); - record.setField("t1", t1.atZone(ZoneId.systemDefault()).toLocalDateTime()); - record.setField("t2", t2); - return record; - } - - /** Generates the records in the expected order, with respect to their datafile */ - private List generateExpectedRecords(boolean ascending) throws Exception { - Table table = CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA_TS); - long baseTime = 1702382109000L; - - GenericAppenderHelper helper = - new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); - - Record file1Record1 = - generateRecord(Instant.ofEpochMilli(baseTime), baseTime + (1000 * 60 * 60 * 24 * 30L)); - Record file1Record2 = - generateRecord( - Instant.ofEpochMilli(baseTime - 10 * 1000L), baseTime + (1000 * 60 * 60 * 24 * 35L)); - - List recordsDataFile1 = Lists.newArrayList(); - recordsDataFile1.add(file1Record1); - recordsDataFile1.add(file1Record2); - DataFile dataFile1 = helper.writeFile(recordsDataFile1); - - Record file2Record1 = - generateRecord( - Instant.ofEpochMilli(baseTime + 14 * 1000L), baseTime - (1000 * 60 * 60 * 24 * 30L)); - Record file2Record2 = - generateRecord( - Instant.ofEpochMilli(baseTime + 12 * 1000L), baseTime - (1000 * 60 * 61 * 24 * 35L)); - - List recordsDataFile2 = Lists.newArrayList(); - recordsDataFile2.add(file2Record1); - recordsDataFile2.add(file2Record2); - - DataFile dataFile2 = helper.writeFile(recordsDataFile2); - helper.appendToTable(dataFile1, dataFile2); - - // Expected records if the splits are ordered - // - ascending (watermark from t1) - records from the split with early timestamps, then - // records from the split with late timestamps - // - descending (watermark from t2) - records from the split with old longs, then records - // from the split with new longs - List expected = Lists.newArrayList(); - if (ascending) { - expected.addAll(recordsDataFile1); - expected.addAll(recordsDataFile2); - } else { - expected.addAll(recordsDataFile2); - expected.addAll(recordsDataFile1); - } - return expected; - } - - /** Tests the order of splits returned when setting the watermark-column options */ - @Test - public void testWatermarkOptionsAscending() throws Exception { - List expected = generateExpectedRecords(true); - TestHelpers.assertRecordsWithOrder( - run( - ImmutableMap.of("watermark-column", "t1", "split-file-open-cost", "128000000"), - "", - "*"), - expected, - SCHEMA_TS); - } - - /** - * Tests the order of splits returned when setting the watermark-column and - * watermark-column-time-unit" options - */ - @Test - public void testWatermarkOptionsDescending() throws Exception { - List expected = generateExpectedRecords(false); - TestHelpers.assertRecordsWithOrder( - run( - ImmutableMap.of( - "watermark-column", - "t2", - "watermark-column-time-unit", - "MILLISECONDS", - 
"split-file-open-cost", - "128000000"), - "", - "*"), - expected, - SCHEMA_TS); - } - - @Test - public void testReadFlinkDynamicTable() throws Exception { - List expected = generateExpectedRecords(false); - SqlHelpers.sql( - getTableEnv(), - "create table `default_catalog`.`default_database`.flink_table LIKE iceberg_catalog.`default`.%s", - TestFixtures.TABLE); - - // Read from table in flink catalog - TestHelpers.assertRecords( - SqlHelpers.sql( - getTableEnv(), "select * from `default_catalog`.`default_database`.flink_table"), - expected, - SCHEMA_TS); - } - - @Test - public void testWatermarkInvalidConfig() { - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA_TS); - - String flinkTable = "`default_catalog`.`default_database`.flink_table"; - SqlHelpers.sql( - getStreamingTableEnv(), - "CREATE TABLE %s " - + "(eventTS AS CAST(t1 AS TIMESTAMP(3)), " - + "WATERMARK FOR eventTS AS SOURCE_WATERMARK()) LIKE iceberg_catalog.`default`.%s", - flinkTable, - TestFixtures.TABLE); - - assertThatThrownBy(() -> SqlHelpers.sql(getStreamingTableEnv(), "SELECT * FROM %s", flinkTable)) - .isInstanceOf(NullPointerException.class) - .hasMessage("watermark-column needs to be configured to use source watermark."); - } - - @Test - public void testWatermarkValidConfig() throws Exception { - List expected = generateExpectedRecords(true); - - String flinkTable = "`default_catalog`.`default_database`.flink_table"; - - SqlHelpers.sql( - getStreamingTableEnv(), - "CREATE TABLE %s " - + "(eventTS AS CAST(t1 AS TIMESTAMP(3)), " - + "WATERMARK FOR eventTS AS SOURCE_WATERMARK()) WITH ('watermark-column'='t1') LIKE iceberg_catalog.`default`.%s", - flinkTable, - TestFixtures.TABLE); - - TestHelpers.assertRecordsWithOrder( - SqlHelpers.sql( - getStreamingTableEnv(), - "SELECT t1, t2 FROM TABLE(TUMBLE(TABLE %s, DESCRIPTOR(eventTS), INTERVAL '1' SECOND))", - flinkTable), - expected, - SCHEMA_TS); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java deleted file mode 100644 index 70889f4f76aa..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java +++ /dev/null @@ -1,408 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; - -import java.io.Serializable; -import java.nio.file.Path; -import java.time.Duration; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneId; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.flink.api.common.JobID; -import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.metrics.Gauge; -import org.apache.flink.runtime.metrics.MetricNames; -import org.apache.flink.runtime.minicluster.MiniCluster; -import org.apache.flink.runtime.minicluster.RpcServiceSharing; -import org.apache.flink.runtime.testutils.CommonTestUtils; -import org.apache.flink.runtime.testutils.InMemoryReporter; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction; -import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows; -import org.apache.flink.streaming.api.windowing.time.Time; -import org.apache.flink.streaming.api.windowing.windows.TimeWindow; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.test.junit5.InjectMiniCluster; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.util.CloseableIterator; -import org.apache.flink.util.Collector; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.HadoopTableExtension; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -public class TestIcebergSourceWithWatermarkExtractor implements Serializable { - private static final int PARALLELISM = 4; - private static final String SOURCE_NAME = "IcebergSource"; - private static final int RECORD_NUM_FOR_2_SPLITS = 200; - private static final ConcurrentMap WINDOWS = Maps.newConcurrentMap(); - - @TempDir protected Path temporaryFolder; - - private static final InMemoryReporter REPORTER = InMemoryReporter.createWithRetainedMetrics(); - - @RegisterExtension - public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(1) - .setNumberSlotsPerTaskManager(PARALLELISM) - .setRpcServiceSharing(RpcServiceSharing.DEDICATED) - .setConfiguration(REPORTER.addToConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG)) - 
.withHaLeadershipControl() - .build()); - - @RegisterExtension - private static final HadoopTableExtension TABLE_EXTENSION = - new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.TS_SCHEMA); - - /** - * This is an integration test for watermark handling and windowing. Integration testing the - * following features:
- *
- *   • - Ordering of the splits
- *   • - Emitting of watermarks
- *   • - Firing windows based on watermarks
- *
- * The test generates 4 splits
- *
- *   • - Split 1 - Watermark 100 min
- *   • - Split 2, 3 - Watermark 0 min
- *   • - Split 4 - Watermark 6 min
- *
- * Creates a source with 5 minutes tumbling window with parallelism 1 (to prevent concurrency
- * issues).
- *
- * Checks that windows are handled correctly based on the emitted watermarks, and splits are
- * read in the following order:
- *
- *   • - Split 2, 3
- *   • - Split 4
- *   • - Split 1
- *
- * As a result the window aggregator emits the records based on in Split 2-3, and Split 4 data.
- *
    Add 2 more splits, so the task manager close the windows for the original 4 splits and emit - * the appropriate aggregated records. - */ - @Test - public void testWindowing() throws Exception { - GenericAppenderHelper dataAppender = appender(); - List expectedRecords = Lists.newArrayList(); - - // Generate records with the following pattern: - // - File 1 - Later records (Watermark 6000000) - // - Split 1 - 2 records (100, "file_1-recordTs_100"), (103, "file_1-recordTs_103") - // - File 2 - First records (Watermark 0) - // - Split 1 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... - // - Split 2 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... - // - File 3 - Parallel write for the first records (Watermark 360000) - // - Split 1 - 2 records (6, "file_3-recordTs_6"), (7, "file_3-recordTs_7") - List batch = - ImmutableList.of( - generateRecord(100, "file_1-recordTs_100"), - generateRecord(101, "file_1-recordTs_101"), - generateRecord(103, "file_1-recordTs_103")); - expectedRecords.addAll(batch); - dataAppender.appendToTable(batch); - - batch = Lists.newArrayListWithCapacity(100); - for (int i = 0; i < RECORD_NUM_FOR_2_SPLITS; ++i) { - // Generate records where the timestamps are out of order, but still between 0-5 minutes - batch.add(generateRecord(4 - i % 5, "file_2-recordTs_" + i)); - } - expectedRecords.addAll(batch); - dataAppender.appendToTable(batch); - - batch = - ImmutableList.of( - generateRecord(6, "file_3-recordTs_6"), generateRecord(7, "file_3-recordTs_7")); - expectedRecords.addAll(batch); - dataAppender.appendToTable(batch); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(1); - - DataStream stream = - env.fromSource( - source(), - WatermarkStrategy.noWatermarks() - .withTimestampAssigner(new RowDataTimestampAssigner()), - SOURCE_NAME, - TypeInformation.of(RowData.class)); - - stream - .windowAll(TumblingEventTimeWindows.of(Time.minutes(5))) - .apply( - new AllWindowFunction() { - @Override - public void apply( - TimeWindow window, Iterable values, Collector out) { - // Emit RowData which contains the window start time, and the record count in - // that window - AtomicInteger count = new AtomicInteger(0); - values.forEach(a -> count.incrementAndGet()); - out.collect(row(window.getStart(), count.get())); - WINDOWS.put(window.getStart(), count.get()); - } - }); - - // Use static variable to collect the windows, since other solutions were flaky - WINDOWS.clear(); - env.executeAsync("Iceberg Source Windowing Test"); - - // Wait for the 2 first windows from File 2 and File 3 - Awaitility.await() - .pollInterval(Duration.ofMillis(10)) - .atMost(30, TimeUnit.SECONDS) - .until( - () -> - WINDOWS.equals( - ImmutableMap.of(0L, RECORD_NUM_FOR_2_SPLITS, TimeUnit.MINUTES.toMillis(5), 2))); - - // Write data so the windows containing test data are closed - dataAppender.appendToTable( - dataAppender.writeFile(ImmutableList.of(generateRecord(1500, "last-record")))); - - // Wait for last test record window from File 1 - Awaitility.await() - .pollInterval(Duration.ofMillis(10)) - .atMost(30, TimeUnit.SECONDS) - .until( - () -> - WINDOWS.equals( - ImmutableMap.of( - 0L, - RECORD_NUM_FOR_2_SPLITS, - TimeUnit.MINUTES.toMillis(5), - 2, - TimeUnit.MINUTES.toMillis(100), - 3))); - } - - /** - * This is an integration test for watermark handling and throttling. Integration testing the - * following: - * - *

      - *
    • - Emitting of watermarks - *
    • - Watermark alignment - *
    - * - *

    The test generates 3 splits - * - *

      - *
    • - Split 1 - Watermark 100 min - *
    • - Split 2, 3 - Watermark 0 min - *
    - * - * The splits are read in the following order: - * - *
      - *
    • - Split 2, 3 (Task Manager 1, Task Manager 2) - *
    • - Split 1 (Task Manager 1 or ask Manager 2 depending on scheduling) - *
    - * - * Reading split 1 will cause the watermark alignment to pause reading for the given task manager. - * - *

    The status of the watermark alignment is checked by the alignment related metrics. - * - *

    Adding new records with old timestamps to the table will enable the running reader to - * continue reading the files, but the watermark alignment will still prevent the paused reader to - * continue. - * - *

    After adding some records with new timestamps the blocked reader is un-paused, and both ot - * the readers continue reading. - */ - @Test - public void testThrottling(@InjectMiniCluster MiniCluster miniCluster) throws Exception { - GenericAppenderHelper dataAppender = appender(); - - // Generate records in advance - - // File 1 - Later records (Watermark 6.000.000 - 100 min) - // - Split 1 - 2 records (100, "file_1-recordTs_100"), (103, "file_1-recordTs_103") - List batch1 = - ImmutableList.of( - generateRecord(100, "file_1-recordTs_100"), generateRecord(103, "file_1-recordTs_103")); - - // File 2 - First records (Watermark 0 - 0 min) - // - Split 1 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... - // - Split 2 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... - List batch2 = Lists.newArrayListWithCapacity(100); - for (int i = 0; i < RECORD_NUM_FOR_2_SPLITS; ++i) { - batch2.add(generateRecord(4 - i % 5, "file_2-recordTs_" + i)); - } - - // File 3 - Some records will be blocked (Watermark 900.000 - 15 min) - List batch3 = - ImmutableList.of( - generateRecord(15, "file_3-recordTs_15"), - generateRecord(16, "file_3-recordTs_16"), - generateRecord(17, "file_3-recordTs_17")); - - // File 4 - Some records will be blocked (Watermark 900.000 - 15 min) - List batch4 = - ImmutableList.of( - generateRecord(15, "file_4-recordTs_15"), - generateRecord(16, "file_4-recordTs_16"), - generateRecord(17, "file_4-recordTs_17")); - - // File 5 - Records which will remove the block (Watermark 5.400.000 - 90 min) - List batch5 = - ImmutableList.of( - generateRecord(90, "file_5-recordTs_90"), generateRecord(91, "file_5-recordTs_91")); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(2); - - DataStream stream = - env.fromSource( - source(), - WatermarkStrategy.noWatermarks() - .withWatermarkAlignment("iceberg", Duration.ofMinutes(20), Duration.ofMillis(10)), - SOURCE_NAME, - TypeInformation.of(RowData.class)); - - try (CloseableIterator resultIterator = stream.collectAsync()) { - JobClient jobClient = env.executeAsync("Iceberg Source Throttling Test"); - CommonTestUtils.waitForAllTaskRunning(miniCluster, jobClient.getJobID(), false); - - // Insert the first data into the table - dataAppender.appendToTable(dataAppender.writeFile(batch1), dataAppender.writeFile(batch2)); - - // Get the drift metric, wait for it to be created and reach the expected state - // (100 min - 20 min - 0 min) - // Also this validates that the WatermarkAlignment is working - Awaitility.await() - .pollInterval(Duration.ofMillis(10)) - .atMost(30, TimeUnit.SECONDS) - .until( - () -> - findAlignmentDriftMetric(jobClient.getJobID(), TimeUnit.MINUTES.toMillis(80)) - .isPresent()); - Gauge drift = - findAlignmentDriftMetric(jobClient.getJobID(), TimeUnit.MINUTES.toMillis(80)).get(); - - // Add some old records with 2 splits, so even if the blocked gets one split, the other reader - // one gets one as well - dataAppender.appendToTable(dataAppender.writeFile(batch3), dataAppender.writeFile(batch4)); - - // Get the drift metric, wait for it to be created and reach the expected state (100 min - 20 - // min - 15 min) - Awaitility.await() - .pollInterval(Duration.ofMillis(10)) - .atMost(30, TimeUnit.SECONDS) - .until(() -> drift.getValue() == TimeUnit.MINUTES.toMillis(65)); - - // Add some new records which should unblock the throttled reader - dataAppender.appendToTable(batch5); - - // Wait for the new drift to decrease below the allowed drift to 
signal the normal state - Awaitility.await() - .pollInterval(Duration.ofMillis(10)) - .atMost(30, TimeUnit.SECONDS) - .until(() -> drift.getValue() < TimeUnit.MINUTES.toMillis(20)); - } - } - - protected IcebergSource source() { - return IcebergSource.builder() - .tableLoader(TABLE_EXTENSION.tableLoader()) - .watermarkColumn("ts") - .project(TestFixtures.TS_SCHEMA) - .splitSize(100L) - .streaming(true) - .monitorInterval(Duration.ofMillis(10)) - .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - } - - protected Record generateRecord(int minutes, String str) { - // Override the ts field to create a more realistic situation for event time alignment - Record record = GenericRecord.create(TestFixtures.TS_SCHEMA); - LocalDateTime ts = - LocalDateTime.ofInstant( - Instant.ofEpochMilli(Time.of(minutes, TimeUnit.MINUTES).toMilliseconds()), - ZoneId.of("Z")); - record.setField("ts", ts); - record.setField("str", str); - return record; - } - - private Optional> findAlignmentDriftMetric(JobID jobID, long withValue) { - String metricsName = SOURCE_NAME + ".*" + MetricNames.WATERMARK_ALIGNMENT_DRIFT; - return REPORTER.findMetrics(jobID, metricsName).values().stream() - .map(m -> (Gauge) m) - .filter(m -> m.getValue() == withValue) - .findFirst(); - } - - private GenericAppenderHelper appender() { - // We need to create multiple splits, so we need to generate parquet files with multiple offsets - org.apache.hadoop.conf.Configuration hadoopConf = new org.apache.hadoop.conf.Configuration(); - hadoopConf.set("write.parquet.page-size-bytes", "64"); - hadoopConf.set("write.parquet.row-group-size-bytes", "64"); - return new GenericAppenderHelper( - TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder, hadoopConf); - } - - private static RowData row(long time, long count) { - GenericRowData result = new GenericRowData(2); - result.setField(0, time); - result.setField(1, String.valueOf(count)); - return result; - } - - private static class RowDataTimestampAssigner implements SerializableTimestampAssigner { - @Override - public long extractTimestamp(RowData element, long recordTimestamp) { - return element.getTimestamp(0, 0).getMillisecond(); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java deleted file mode 100644 index 7bd98c69ff36..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
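[Editorial aside, not part of the patch: the test deleted above exercises watermark emission and watermark alignment on the FLIP-27 Iceberg source. The sketch below condenses its setup into a minimal user-facing job, with generics restored and the table loader passed in as a placeholder; it mirrors the builder calls and alignment settings that appear in the removed code rather than prescribing them.]

    import java.time.Duration;
    import org.apache.flink.api.common.eventtime.WatermarkStrategy;
    import org.apache.flink.api.common.typeinfo.TypeInformation;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.table.data.RowData;
    import org.apache.iceberg.flink.TableLoader;
    import org.apache.iceberg.flink.source.IcebergSource;
    import org.apache.iceberg.flink.source.StreamingStartingStrategy;

    public class WatermarkAlignedReadSketch {
      public static DataStream<RowData> build(StreamExecutionEnvironment env, TableLoader loader) {
        // Emit watermarks from the "ts" column, as the removed test does.
        IcebergSource<RowData> source =
            IcebergSource.forRowData()
                .tableLoader(loader)
                .watermarkColumn("ts")
                .streaming(true)
                .monitorInterval(Duration.ofMillis(10)) // test value; production jobs poll less often
                .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL)
                .build();

        return env.fromSource(
            source,
            // Watermarks come from the source splits; alignment pauses readers that run too far ahead.
            WatermarkStrategy.<RowData>noWatermarks()
                .withWatermarkAlignment("iceberg", Duration.ofMinutes(20), Duration.ofMillis(10)),
            "IcebergSource",
            TypeInformation.of(RowData.class));
      }
    }

With alignment configured, a reader whose local watermark runs more than the allowed drift ahead of the group is paused until the slower splits catch up, which is the behaviour the removed testThrottling case asserted through the WATERMARK_ALIGNMENT_DRIFT metric.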
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Files; -import java.time.Duration; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.api.common.RuntimeExecutionMode; -import org.apache.flink.api.common.TaskInfo; -import org.apache.flink.api.common.functions.RichMapFunction; -import org.apache.flink.api.common.typeinfo.Types; -import org.apache.flink.configuration.BatchExecutionOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.configuration.JobManagerOptions; -import org.apache.flink.configuration.RestOptions; -import org.apache.flink.configuration.SlowTaskDetectorOptions; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.Table; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.TestBase; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; - -/** - * There is a infinite sleep in the test. Add a timeout to the test to avoid stuck situation in case - * anything goes wrong unexpectedly. 
- */ -@Timeout(value = 60) -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergSpeculativeExecutionSupport extends TestBase { - private static final int NUM_TASK_MANAGERS = 1; - private static final int NUM_TASK_SLOTS = 3; - - @RegisterExtension - public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(NUM_TASK_MANAGERS) - .setNumberSlotsPerTaskManager(NUM_TASK_SLOTS) - .setConfiguration(configure()) - .build()); - - private StreamTableEnvironment tEnv; - private static final String CATALOG_NAME = "test_catalog"; - private static final String DATABASE_NAME = "test_db"; - private static final String INPUT_TABLE_NAME = "test_table"; - private static final String OUTPUT_TABLE_NAME = "sink_table"; - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment(configure()); - env.setRuntimeMode(RuntimeExecutionMode.BATCH); - tEnv = StreamTableEnvironment.create(env); - } - } - - return tEnv; - } - - @Parameter(index = 0) - private boolean useV2Sink; - - @Parameters(name = "useV2Sink = {0}") - public static Object[][] parameters() { - return new Object[][] {{true}, {false}}; - } - - @BeforeEach - public void before() throws IOException { - String warehouse = - String.format("file:%s", Files.createTempDirectory(temporaryDirectory, "junit").toString()); - sql( - "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_NAME, warehouse); - sql("USE CATALOG %s", CATALOG_NAME); - sql("CREATE DATABASE %s", DATABASE_NAME); - sql("USE %s", DATABASE_NAME); - - sql("CREATE TABLE %s (i INT, j INT)", INPUT_TABLE_NAME); - sql("INSERT INTO %s VALUES (1, -1),(2, -1),(3, -1)", INPUT_TABLE_NAME); - sql("CREATE TABLE %s (i INT, j INT, subTask INT, attempt INT)", OUTPUT_TABLE_NAME); - } - - @AfterEach - public void after() { - sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, INPUT_TABLE_NAME); - sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME); - dropDatabase(DATABASE_NAME, true); - dropCatalog(CATALOG_NAME, true); - } - - @TestTemplate - public void testSpeculativeExecution() throws Exception { - tEnv.getConfig().set("table.exec.iceberg.use-v2-sink", String.valueOf(useV2Sink)); - Table table = - tEnv.sqlQuery(String.format("SELECT * FROM %s.%s", DATABASE_NAME, INPUT_TABLE_NAME)); - DataStream slowStream = - tEnv.toDataStream(table, Row.class) - .map(new TestingMap()) - .name("test_map") - .returns( - Types.ROW_NAMED( - new String[] {"i", "j", "subTask", "attempt"}, - Types.INT, - Types.INT, - Types.INT, - Types.INT)) - .setParallelism(NUM_TASK_SLOTS); - - tEnv.fromDataStream(slowStream) - .executeInsert(String.format("%s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)) - .await(); - - List output = sql(String.format("SELECT * FROM %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)); - - // Ensure that all subTasks has attemptNum > 0 - assertThat(output.stream().map(x -> x.getField(3)).collect(Collectors.toSet())).contains(1); - - // Ensure the test_table rows are returned exactly the same after the slow map task from the - // sink_table - assertSameElements( - output.stream().map(x -> Row.of(x.getField(0), x.getField(1))).collect(Collectors.toList()), - Arrays.asList(Row.of(1, -1), Row.of(2, -1), Row.of(3, -1))); - } - - /** A testing map function that simulates the slow task. 
*/ - private static class TestingMap extends RichMapFunction { - @Override - public Row map(Row row) throws Exception { - // Simulate slow subtask 0 with attempt 0 - TaskInfo taskInfo = getRuntimeContext().getTaskInfo(); - if (taskInfo.getIndexOfThisSubtask() == 0 && taskInfo.getAttemptNumber() <= 0) { - Thread.sleep(Integer.MAX_VALUE); - } - - Row output = - Row.of( - row.getField(0), - row.getField(1), - getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(), - getRuntimeContext().getTaskInfo().getAttemptNumber()); - - return output; - } - } - - private static Configuration configure() { - Configuration configuration = new Configuration(); - configuration.set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - configuration.set(RestOptions.BIND_PORT, "0"); - configuration.set(JobManagerOptions.SLOT_REQUEST_TIMEOUT, 5000L); - - // Use FLIP-27 source - configuration.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true); - configuration.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - - // for speculative execution - configuration.set(BatchExecutionOptions.SPECULATIVE_ENABLED, true); - - configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_MULTIPLIER, 1.0); - configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_RATIO, 0.2); - configuration.set( - SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_LOWER_BOUND, Duration.ofMillis(0)); - configuration.set(BatchExecutionOptions.BLOCK_SLOW_NODE_DURATION, Duration.ofMillis(0)); - - return configuration; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java deleted file mode 100644 index 488969bab045..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
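[Editorial aside, not part of the patch: the speculative-execution test deleted above drives the feature purely through cluster configuration. A condensed sketch of those settings, using only the options that appear in the removed configure() method, is shown below; the aggressive slow-task thresholds are test values, not production recommendations.]

    import java.time.Duration;
    import org.apache.flink.configuration.BatchExecutionOptions;
    import org.apache.flink.configuration.Configuration;
    import org.apache.flink.configuration.SlowTaskDetectorOptions;
    import org.apache.iceberg.flink.FlinkConfigOptions;

    public class SpeculativeExecutionConfigSketch {
      public static Configuration build() {
        Configuration conf = new Configuration();
        // Read through the FLIP-27 source with a fixed parallelism so attempts are comparable.
        conf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true);
        conf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false);
        // Enable speculative execution and make the slow-task detector fire aggressively.
        conf.set(BatchExecutionOptions.SPECULATIVE_ENABLED, true);
        conf.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_MULTIPLIER, 1.0);
        conf.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_RATIO, 0.2);
        conf.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_LOWER_BOUND, Duration.ofMillis(0));
        conf.set(BatchExecutionOptions.BLOCK_SLOW_NODE_DURATION, Duration.ofMillis(0));
        return conf;
      }
    }

In the removed test this configuration, combined with a map task that sleeps indefinitely on attempt 0 of subtask 0, forces the scheduler to launch a speculative attempt whose output is then committed through the Iceberg sink.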
- */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.nio.file.Path; -import java.util.Base64; -import java.util.List; -import java.util.Map; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Files; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.FileHelpers; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.CatalogTestBase; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.io.TempDir; - -public class TestMetadataTableReadableMetrics extends CatalogTestBase { - private static final String TABLE_NAME = "test_table"; - - @Parameters(name = "catalogName={0}, baseNamespace={1}") - protected static List parameters() { - List parameters = Lists.newArrayList(); - String catalogName = "testhive"; - Namespace baseNamespace = Namespace.empty(); - parameters.add(new Object[] {catalogName, baseNamespace}); - return parameters; - } - - @Override - protected TableEnvironment getTableEnv() { - Configuration configuration = super.getTableEnv().getConfig().getConfiguration(); - configuration.set(CoreOptions.DEFAULT_PARALLELISM, 1); - return super.getTableEnv(); - } - - private @TempDir Path temp; - - private static final Types.StructType LEAF_STRUCT_TYPE = - Types.StructType.of( - optional(1, "leafLongCol", Types.LongType.get()), - optional(2, "leafDoubleCol", Types.DoubleType.get())); - - private static final Types.StructType NESTED_STRUCT_TYPE = - Types.StructType.of(required(3, "leafStructCol", LEAF_STRUCT_TYPE)); - - private static final Schema NESTED_SCHEMA = - new Schema(required(4, "nestedStructCol", NESTED_STRUCT_TYPE)); - - private static final Schema PRIMITIVE_SCHEMA = - new Schema( - required(1, "booleanCol", Types.BooleanType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "longCol", Types.LongType.get()), - required(4, "floatCol", Types.FloatType.get()), - required(5, "doubleCol", Types.DoubleType.get()), - optional(6, "decimalCol", Types.DecimalType.of(10, 2)), - optional(7, "stringCol", Types.StringType.get()), - optional(8, "fixedCol", Types.FixedType.ofLength(3)), - optional(9, "binaryCol", Types.BinaryType.get())); - - private Table createPrimitiveTable() throws IOException { - Table table = - catalog.createTable( - TableIdentifier.of(DATABASE, TABLE_NAME), - PRIMITIVE_SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of()); - List records = - Lists.newArrayList( - createPrimitiveRecord( - false, 
- 1, - 1L, - 0, - 1.0D, - new BigDecimal("1.00"), - "1", - Base64.getDecoder().decode("1111"), - ByteBuffer.wrap(Base64.getDecoder().decode("1111"))), - createPrimitiveRecord( - true, - 2, - 2L, - 0, - 2.0D, - new BigDecimal("2.00"), - "2", - Base64.getDecoder().decode("2222"), - ByteBuffer.wrap(Base64.getDecoder().decode("2222"))), - createPrimitiveRecord(false, 1, 1, Float.NaN, Double.NaN, null, "1", null, null), - createPrimitiveRecord( - false, 2, 2L, Float.NaN, 2.0D, new BigDecimal("2.00"), "2", null, null)); - - File testFile = File.createTempFile("junit", null, temp.toFile()); - DataFile dataFile = FileHelpers.writeDataFile(table, Files.localOutput(testFile), records); - table.newAppend().appendFile(dataFile).commit(); - return table; - } - - private Table createNestedTable() throws IOException { - Table table = - validationCatalog.createTable( - TableIdentifier.of(DATABASE, TABLE_NAME), - NESTED_SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of()); - - List records = - Lists.newArrayList( - createNestedRecord(0L, 0.0), - createNestedRecord(1L, Double.NaN), - createNestedRecord(null, null)); - - File testFile = File.createTempFile("junit", null, temp.toFile()); - DataFile dataFile = FileHelpers.writeDataFile(table, Files.localOutput(testFile), records); - table.newAppend().appendFile(dataFile).commit(); - - return table; - } - - @BeforeEach - public void before() { - super.before(); - sql("USE CATALOG %s", catalogName); - sql("CREATE DATABASE %s", DATABASE); - sql("USE %s", DATABASE); - } - - @Override - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); - dropDatabase(flinkDatabase, true); - super.clean(); - } - - protected GenericRecord createPrimitiveRecord( - boolean booleanCol, - int intCol, - long longCol, - float floatCol, - double doubleCol, - BigDecimal decimalCol, - String stringCol, - byte[] fixedCol, - ByteBuffer binaryCol) { - GenericRecord record = GenericRecord.create(PRIMITIVE_SCHEMA); - record.set(0, booleanCol); - record.set(1, intCol); - record.set(2, longCol); - record.set(3, floatCol); - record.set(4, doubleCol); - record.set(5, decimalCol); - record.set(6, stringCol); - record.set(7, fixedCol); - record.set(8, binaryCol); - return record; - } - - private GenericRecord createNestedRecord(Long longCol, Double doubleCol) { - GenericRecord record = GenericRecord.create(NESTED_SCHEMA); - GenericRecord nested = GenericRecord.create(NESTED_STRUCT_TYPE); - GenericRecord leaf = GenericRecord.create(LEAF_STRUCT_TYPE); - leaf.set(0, longCol); - leaf.set(1, doubleCol); - nested.set(0, leaf); - record.set(0, nested); - return record; - } - - protected Object[] row(Object... values) { - return values; - } - - @TestTemplate - public void testPrimitiveColumns() throws Exception { - Table table = createPrimitiveTable(); - List result = sql("SELECT readable_metrics FROM %s$files", TABLE_NAME); - - // With new releases of Parquet, new features might be added which cause the - // size of the column to increase. For example, with Parquet 1.14.x the - // uncompressed size has been added to allow for better allocation of memory upfront. 
- // Therefore, we look the sizes up, rather than hardcoding them - DataFile dataFile = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - Map columnSizeStats = dataFile.columnSizes(); - - Row binaryCol = - Row.of( - columnSizeStats.get(PRIMITIVE_SCHEMA.findField("binaryCol").fieldId()), - 4L, - 2L, - null, - Base64.getDecoder().decode("1111"), - Base64.getDecoder().decode("2222")); - Row booleanCol = - Row.of( - columnSizeStats.get(PRIMITIVE_SCHEMA.findField("booleanCol").fieldId()), - 4L, - 0L, - null, - false, - true); - Row decimalCol = - Row.of( - columnSizeStats.get(PRIMITIVE_SCHEMA.findField("decimalCol").fieldId()), - 4L, - 1L, - null, - new BigDecimal("1.00"), - new BigDecimal("2.00")); - Row doubleCol = - Row.of( - columnSizeStats.get(PRIMITIVE_SCHEMA.findField("doubleCol").fieldId()), - 4L, - 0L, - 1L, - 1.0D, - 2.0D); - Row fixedCol = - Row.of( - columnSizeStats.get(PRIMITIVE_SCHEMA.findField("fixedCol").fieldId()), - 4L, - 2L, - null, - Base64.getDecoder().decode("1111"), - Base64.getDecoder().decode("2222")); - Row floatCol = - Row.of( - columnSizeStats.get(PRIMITIVE_SCHEMA.findField("floatCol").fieldId()), - 4L, - 0L, - 2L, - 0f, - 0f); - Row intCol = - Row.of( - columnSizeStats.get(PRIMITIVE_SCHEMA.findField("intCol").fieldId()), - 4L, - 0L, - null, - 1, - 2); - Row longCol = - Row.of( - columnSizeStats.get(PRIMITIVE_SCHEMA.findField("longCol").fieldId()), - 4L, - 0L, - null, - 1L, - 2L); - Row stringCol = - Row.of( - columnSizeStats.get(PRIMITIVE_SCHEMA.findField("stringCol").fieldId()), - 4L, - 0L, - null, - "1", - "2"); - - List expected = - Lists.newArrayList( - Row.of( - Row.of( - binaryCol, - booleanCol, - decimalCol, - doubleCol, - fixedCol, - floatCol, - intCol, - longCol, - stringCol))); - TestHelpers.assertRows(result, expected); - } - - @TestTemplate - public void testSelectPrimitiveValues() throws Exception { - createPrimitiveTable(); - - TestHelpers.assertRows( - sql( - "SELECT readable_metrics.intCol.lower_bound, readable_metrics.booleanCol.upper_bound FROM %s$files", - TABLE_NAME), - ImmutableList.of(Row.of(1, true))); - - TestHelpers.assertRows( - sql("SELECT content, readable_metrics.longCol.value_count FROM %s$files", TABLE_NAME), - ImmutableList.of(Row.of(0, 4L))); - - TestHelpers.assertRows( - sql("SELECT readable_metrics.longCol.value_count, content FROM %s$files", TABLE_NAME), - ImmutableList.of(Row.of(4L, 0))); - } - - @TestTemplate - public void testSelectNestedValues() throws Exception { - createNestedTable(); - TestHelpers.assertRows( - sql( - "SELECT readable_metrics.`nestedStructCol.leafStructCol.leafLongCol`.lower_bound, " - + "readable_metrics.`nestedStructCol.leafStructCol.leafDoubleCol`.value_count FROM %s$files", - TABLE_NAME), - ImmutableList.of(Row.of(0L, 3L))); - } - - @TestTemplate - public void testNestedValues() throws Exception { - createNestedTable(); - List result = sql("SELECT readable_metrics FROM %s$files", TABLE_NAME); - - // We have to take a slightly different approach, since we don't store - // the column sizes for nested fields. 
- long leafDoubleColSize = - (long) ((Row) ((Row) result.get(0).getField(0)).getField(0)).getField(0); - long leafLongColSize = (long) ((Row) ((Row) result.get(0).getField(0)).getField(1)).getField(0); - - Row leafDoubleCol = Row.of(leafDoubleColSize, 3L, 1L, 1L, 0.0D, 0.0D); - Row leafLongCol = Row.of(leafLongColSize, 3L, 1L, null, 0L, 1L); - Row metrics = Row.of(Row.of(leafDoubleCol, leafLongCol)); - - TestHelpers.assertRows(result, ImmutableList.of(metrics)); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java deleted file mode 100644 index ef8380c21613..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; -import java.util.Set; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.RowDelta; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.data.RowDataProjection; -import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestProjectMetaColumn { - - @TempDir protected Path temporaryFolder; - - @Parameter(index = 0) - private FileFormat format; - - @Parameters(name = "fileFormat={0}") - public static Iterable parameters() { - return 
Lists.newArrayList( - new Object[] {FileFormat.PARQUET}, - new Object[] {FileFormat.ORC}, - new Object[] {FileFormat.AVRO}); - } - - private void testSkipToRemoveMetaColumn(int formatVersion) throws IOException { - // Create the table with given format version. - String location = Files.createTempDirectory(temporaryFolder, "junit").toFile().toString(); - Table table = - SimpleDataUtil.createTable( - location, - ImmutableMap.of(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)), - false); - - List rows = - Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createInsert(2, "BBB"), - SimpleDataUtil.createInsert(3, "CCC")); - writeAndCommit(table, ImmutableSet.of(), false, rows); - - FlinkInputFormat input = - FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); - - List results = Lists.newArrayList(); - TestHelpers.readRowData( - input, - rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - assertThat(rowData).isInstanceOf(GenericRowData.class); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); - - // Assert the results. - TestHelpers.assertRows(rows, results, SimpleDataUtil.ROW_TYPE); - } - - @TestTemplate - public void testV1SkipToRemoveMetaColumn() throws IOException { - testSkipToRemoveMetaColumn(1); - } - - @TestTemplate - public void testV2SkipToRemoveMetaColumn() throws IOException { - testSkipToRemoveMetaColumn(2); - } - - @TestTemplate - public void testV2RemoveMetaColumn() throws Exception { - // Create the v2 table. - String location = Files.createTempDirectory(temporaryFolder, "junit").toFile().toString(); - Table table = - SimpleDataUtil.createTable( - location, ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), false); - - List rows = - Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createDelete(1, "AAA"), - SimpleDataUtil.createInsert(2, "AAA"), - SimpleDataUtil.createInsert(2, "BBB")); - int eqFieldId = table.schema().findField("data").fieldId(); - writeAndCommit(table, ImmutableSet.of(eqFieldId), true, rows); - - FlinkInputFormat input = - FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); - - List results = Lists.newArrayList(); - TestHelpers.readRowData( - input, - rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - assertThat(rowData).isInstanceOf(RowDataProjection.class); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); - - // Assert the results. 
- TestHelpers.assertRows( - ImmutableList.of( - SimpleDataUtil.createInsert(2, "AAA"), SimpleDataUtil.createInsert(2, "BBB")), - results, - SimpleDataUtil.ROW_TYPE); - } - - private void writeAndCommit( - Table table, Set eqFieldIds, boolean upsert, List rows) throws IOException { - TaskWriter writer = createTaskWriter(table, eqFieldIds, upsert); - try (TaskWriter io = writer) { - for (RowData row : rows) { - io.write(row); - } - } - - RowDelta delta = table.newRowDelta(); - WriteResult result = writer.complete(); - - for (DataFile dataFile : result.dataFiles()) { - delta.addRows(dataFile); - } - - for (DeleteFile deleteFile : result.deleteFiles()) { - delta.addDeletes(deleteFile); - } - - delta.commit(); - } - - private TaskWriter createTaskWriter( - Table table, Set equalityFieldIds, boolean upsert) { - TaskWriterFactory taskWriterFactory = - new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - SimpleDataUtil.ROW_TYPE, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, - format, - table.properties(), - equalityFieldIds, - upsert); - - taskWriterFactory.initialize(1, 1); - return taskWriterFactory.create(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java deleted file mode 100644 index 6ef40693827e..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.avro.generic.GenericRecord; -import org.apache.iceberg.flink.AvroGenericRecordConverterBase; -import org.apache.iceberg.flink.DataGenerator; - -public class TestRowDataToAvroGenericRecordConverter extends AvroGenericRecordConverterBase { - @Override - protected void testConverter(DataGenerator dataGenerator) { - RowDataToAvroGenericRecordConverter converter = - RowDataToAvroGenericRecordConverter.fromAvroSchema(dataGenerator.avroSchema()); - GenericRecord expected = dataGenerator.generateAvroGenericRecord(); - GenericRecord actual = converter.apply(dataGenerator.generateFlinkRowData()); - assertThat(actual).isEqualTo(expected); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java deleted file mode 100644 index 5dd7de545e11..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import org.junit.jupiter.api.Test; - -class TestScanContext { - @Test - void testIncrementalFromSnapshotId() { - ScanContext context = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .build(); - assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); - - context = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(1L) - .startSnapshotTimestamp(1L) - .build(); - assertException( - context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); - } - - @Test - void testIncrementalFromSnapshotTimestamp() { - ScanContext context = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .build(); - assertException( - context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); - - context = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotId(1L) - .startSnapshotTimestamp(1L) - .build(); - assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); - } - - @Test - void testStreaming() { - ScanContext context = ScanContext.builder().streaming(true).useTag("tag").build(); - assertException(context, "Cannot scan table using ref tag configured for streaming reader"); - - context = ScanContext.builder().streaming(true).useSnapshotId(1L).build(); - assertException(context, "Cannot set snapshot-id option for streaming reader"); - - context = ScanContext.builder().streaming(true).asOfTimestamp(1L).build(); - assertException(context, "Cannot set as-of-timestamp option for streaming reader"); - - context = ScanContext.builder().streaming(true).endSnapshotId(1L).build(); - assertException(context, "Cannot set end-snapshot-id option for streaming reader"); - - context = ScanContext.builder().streaming(true).endTag("tag").build(); - assertException(context, "Cannot set end-tag option for streaming reader"); - } - - @Test - void testStartConflict() { - ScanContext context = ScanContext.builder().startTag("tag").startSnapshotId(1L).build(); - assertException(context, "START_SNAPSHOT_ID and START_TAG cannot both be set."); - } - - @Test - void testEndConflict() { - ScanContext context = ScanContext.builder().endTag("tag").endSnapshotId(1L).build(); - assertException(context, "END_SNAPSHOT_ID and END_TAG cannot both be set."); - } - - @Test - void testMaxAllowedPlanningFailures() { - ScanContext context = ScanContext.builder().maxAllowedPlanningFailures(-2).build(); - assertException( - context, "Cannot set maxAllowedPlanningFailures to a negative number other than -1."); - } - - private void assertException(ScanContext context, String message) { - assertThatThrownBy(() -> context.validate()) - .hasMessage(message) - .isInstanceOf(IllegalArgumentException.class); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java deleted file mode 100644 index b701419a7499..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java +++ 
/dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import org.apache.flink.configuration.Configuration; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.junit.jupiter.api.Test; - -public class TestSourceUtil { - @Test - public void testInferedParallelism() throws IOException { - Configuration configuration = new Configuration(); - // Empty table, infer parallelism should be at least 1 - int parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 0); - assertThat(parallelism).isEqualTo(1); - - // 2 splits (max infer is the default value 100 , max > splits num), the parallelism is splits - // num : 2 - parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 2); - assertThat(parallelism).isEqualTo(2); - - // 2 splits and limit is 1 , max infer parallelism is default 100, - // which is greater than splits num and limit, the parallelism is the limit value : 1 - parallelism = SourceUtil.inferParallelism(configuration, 1, () -> 2); - assertThat(parallelism).isEqualTo(1); - - // 2 splits and max infer parallelism is 1 (max < splits num), the parallelism is 1 - configuration.setInteger(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 1); - parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 2); - assertThat(parallelism).isEqualTo(1); - - // 2 splits, max infer parallelism is 1, limit is 3, the parallelism is max infer parallelism : - // 1 - parallelism = SourceUtil.inferParallelism(configuration, 3, () -> 2); - assertThat(parallelism).isEqualTo(1); - - // 2 splits, infer parallelism is disabled, the parallelism is flink default parallelism 1 - configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - parallelism = SourceUtil.inferParallelism(configuration, 3, () -> 2); - assertThat(parallelism).isEqualTo(1); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java deleted file mode 100644 index dd63154fe03b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import java.util.Map; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -/** Test other more advanced usage of SQL. They don't need to run for every file format. 
*/ -public abstract class TestSqlBase { - @RegisterExtension - public static MiniClusterExtension miniClusterExtension = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @RegisterExtension - public static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - @TempDir protected Path temporaryFolder; - - private volatile TableEnvironment tEnv; - - private volatile TableEnvironment streamingTEnv; - - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - this.tEnv = - TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); - } - } - } - return tEnv; - } - - protected TableEnvironment getStreamingTableEnv() { - if (streamingTEnv == null) { - synchronized (this) { - if (streamingTEnv == null) { - this.streamingTEnv = - TableEnvironment.create(EnvironmentSettings.newInstance().inStreamingMode().build()); - } - } - } - - return streamingTEnv; - } - - @BeforeEach - public abstract void before() throws IOException; - - @Test - public void testResiduals() throws Exception { - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - - List writeRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - writeRecords.get(0).set(1, 123L); - writeRecords.get(0).set(2, "2020-03-20"); - writeRecords.get(1).set(1, 456L); - writeRecords.get(1).set(2, "2020-03-20"); - - GenericAppenderHelper helper = - new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.add(writeRecords.get(0)); - - DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), writeRecords); - DataFile dataFile2 = - helper.writeFile( - TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); - helper.appendToTable(dataFile1, dataFile2); - - org.apache.iceberg.flink.TestHelpers.assertRecords( - run(Maps.newHashMap(), "where dt='2020-03-20' and id=123", "*"), - expectedRecords, - TestFixtures.SCHEMA); - } - - @Test - public void testExposeLocality() throws Exception { - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - - TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 10, 0L); - expectedRecords.forEach(expectedRecord -> expectedRecord.set(2, "2020-03-20")); - - GenericAppenderHelper helper = - new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); - DataFile dataFile = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - helper.appendToTable(dataFile); - - // test sql api - Configuration tableConf = getTableEnv().getConfig().getConfiguration(); - tableConf.setBoolean( - FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), false); - - List results = SqlHelpers.sql(getTableEnv(), "select * from t"); - org.apache.iceberg.flink.TestHelpers.assertRecords( - results, expectedRecords, TestFixtures.SCHEMA); - - // test table api - tableConf.setBoolean( - FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), true); - FlinkSource.Builder builder = FlinkSource.forRowData().tableLoader(tableLoader).table(table); - - // When running with CI or local, `localityEnabled` will be false even if this configuration is - // 
enabled - assertThat(SourceUtil.isLocalityEnabled(table, tableConf, true)) - .as("Expose split locality info should be false.") - .isFalse(); - - results = run(Maps.newHashMap(), "where dt='2020-03-20'", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords( - results, expectedRecords, TestFixtures.SCHEMA); - } - - protected List run( - Map options, String sqlFilter, String... sqlSelectedFields) { - String select = String.join(",", sqlSelectedFields); - String optionStr = SqlHelpers.sqlOptionsToString(options); - return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java deleted file mode 100644 index 2f3e0f78ba10..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java +++ /dev/null @@ -1,490 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.ExplainDetail; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.CatalogTestBase; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.Timeout; - -@Timeout(60) -public class TestStreamScanSql extends CatalogTestBase { - private static final String TABLE = "test_table"; - private static final FileFormat FORMAT = FileFormat.PARQUET; - - private volatile int defaultJobParallelism; - - private volatile TableEnvironment tEnv; - - @Override - protected TableEnvironment getTableEnv() { - TableEnvironment tableEnv = tEnv; - if (tableEnv != null) { - return tableEnv; - } - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings.Builder settingsBuilder = - EnvironmentSettings.newInstance().inStreamingMode(); - - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(400); - - StreamTableEnvironment streamTableEnv = - StreamTableEnvironment.create(env, settingsBuilder.build()); - defaultJobParallelism = env.getParallelism(); - streamTableEnv - .getConfig() - .getConfiguration() - .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - tEnv = streamTableEnv; - } - } - return tEnv; - } - - @Override - @BeforeEach - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - } - - @Override - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE); - dropDatabase(flinkDatabase, true); - super.clean(); - } - - private void insertRows(String partition, Table table, Row... rows) throws IOException { - insertRows(partition, SnapshotRef.MAIN_BRANCH, table, rows); - } - - private void insertRows(String partition, String branch, Table table, Row... 
rows) - throws IOException { - GenericAppenderHelper appender = new GenericAppenderHelper(table, FORMAT, temporaryDirectory); - - GenericRecord gRecord = GenericRecord.create(table.schema()); - List records = Lists.newArrayList(); - for (Row row : rows) { - records.add( - gRecord.copy( - "id", row.getField(0), - "data", row.getField(1), - "dt", row.getField(2))); - } - - if (partition != null) { - appender.appendToTable(TestHelpers.Row.of(partition, 0), branch, records); - } else { - appender.appendToTable(branch, records); - } - } - - private void insertRowsInBranch(String branch, Table table, Row... rows) throws IOException { - insertRows(null, branch, table, rows); - } - - private void insertRows(Table table, Row... rows) throws IOException { - insertRows(null, table, rows); - } - - private void assertRows(List expectedRows, Iterator iterator) { - for (Row expectedRow : expectedRows) { - assertThat(iterator).hasNext(); - Row actualRow = iterator.next(); - assertThat(actualRow.getArity()).isEqualTo(3); - assertThat(actualRow.getField(0)).isEqualTo(expectedRow.getField(0)); - assertThat(actualRow.getField(1)).isEqualTo(expectedRow.getField(1)); - assertThat(actualRow.getField(2)).isEqualTo(expectedRow.getField(2)); - } - } - - @TestTemplate - public void testUnPartitionedTable() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - TableResult result = - exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); - try (CloseableIterator iterator = result.collect()) { - - Row row1 = Row.of(1, "aaa", "2021-01-01"); - insertRows(table, row1); - assertRows(ImmutableList.of(row1), iterator); - - Row row2 = Row.of(2, "bbb", "2021-01-01"); - insertRows(table, row2); - assertRows(ImmutableList.of(row2), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - public void testPartitionedTable() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) PARTITIONED BY (dt)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - TableResult result = - exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); - try (CloseableIterator iterator = result.collect()) { - Row row1 = Row.of(1, "aaa", "2021-01-01"); - insertRows("2021-01-01", table, row1); - assertRows(ImmutableList.of(row1), iterator); - - Row row2 = Row.of(2, "bbb", "2021-01-02"); - insertRows("2021-01-02", table, row2); - assertRows(ImmutableList.of(row2), iterator); - - Row row3 = Row.of(1, "aaa", "2021-01-02"); - insertRows("2021-01-02", table, row3); - assertRows(ImmutableList.of(row3), iterator); - - Row row4 = Row.of(2, "bbb", "2021-01-01"); - insertRows("2021-01-01", table, row4); - assertRows(ImmutableList.of(row4), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - public void testConsumeFromBeginning() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - insertRows(table, row1, row2); - - TableResult result = - exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); - try (CloseableIterator iterator = result.collect()) { - 
assertRows(ImmutableList.of(row1, row2), iterator); - - Row row3 = Row.of(3, "ccc", "2021-01-01"); - insertRows(table, row3); - assertRows(ImmutableList.of(row3), iterator); - - Row row4 = Row.of(4, "ddd", "2021-01-01"); - insertRows(table, row4); - assertRows(ImmutableList.of(row4), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - /** - * Insert records on the main branch. Then, insert in a named branch. Reads from the main branch - * and assert that the only records from main are returned - */ - public void testConsumeFilesFromMainBranch() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - // Produce two snapshots on main branch - Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - - insertRows(table, row1, row2); - String branchName = "b1"; - table.manageSnapshots().createBranch(branchName).commit(); - - // insert on the 'b1' branch - Row row3 = Row.of(3, "ccc", "2021-01-01"); - Row row4 = Row.of(4, "ddd", "2021-01-01"); - - insertRowsInBranch(branchName, table, row3, row4); - - // read from main - TableResult result = - exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); - - try (CloseableIterator iterator = result.collect()) { - // the start snapshot(row2) is exclusive. - assertRows(ImmutableList.of(row1, row2), iterator); - - Row row5 = Row.of(5, "eee", "2021-01-01"); - Row row6 = Row.of(6, "fff", "2021-01-01"); - insertRows(table, row5, row6); - assertRows(ImmutableList.of(row5, row6), iterator); - - Row row7 = Row.of(7, "ggg", "2021-01-01"); - insertRows(table, row7); - assertRows(ImmutableList.of(row7), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - /** - * Insert records on the main branch. Creates a named branch. Insert record on named branch. Then - * select from the named branch and assert all the records are returned. - */ - public void testConsumeFilesFromBranch() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - // Produce two snapshots on main branch - Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - - insertRows(table, row1, row2); - String branchName = "b1"; - table.manageSnapshots().createBranch(branchName).commit(); - - TableResult result = - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ", - TABLE, branchName); - - try (CloseableIterator iterator = result.collect()) { - assertRows(ImmutableList.of(row1, row2), iterator); - // insert on the 'b1' branch - Row row3 = Row.of(3, "ccc", "2021-01-01"); - Row row4 = Row.of(4, "ddd", "2021-01-01"); - insertRowsInBranch(branchName, table, row3, row4); - assertRows(ImmutableList.of(row3, row4), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - /** - * Insert records on branch b1. Then insert record on b2. 
Then select from each branch and assert - * the correct records are returned - */ - public void testConsumeFilesFromTwoBranches() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - String branch1 = "b1"; - String branch2 = "b2"; - table.manageSnapshots().createBranch(branch1).commit(); - table.manageSnapshots().createBranch(branch2).commit(); - - // Produce two snapshots on main branch - Row row1Branch1 = Row.of(1, "b1", "2021-01-01"); - Row row2Branch1 = Row.of(2, "b1", "2021-01-01"); - - Row row1Branch2 = Row.of(2, "b2", "2021-01-01"); - Row row2Branch2 = Row.of(3, "b3", "2021-01-01"); - - insertRowsInBranch(branch1, table, row1Branch1, row2Branch1); - insertRowsInBranch(branch2, table, row1Branch2, row2Branch2); - - TableResult resultBranch1 = - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ", - TABLE, branch1); - - try (CloseableIterator iterator = resultBranch1.collect()) { - assertRows(ImmutableList.of(row1Branch1, row2Branch1), iterator); - Row another = Row.of(4, "ccc", "2021-01-01"); - insertRowsInBranch(branch1, table, another); - assertRows(ImmutableList.of(another), iterator); - } - - TableResult resultBranch2 = - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ", - TABLE, branch2); - try (CloseableIterator iterator = resultBranch2.collect()) { - assertRows(ImmutableList.of(row1Branch2, row2Branch2), iterator); - Row another = Row.of(4, "ccc", "2021-01-01"); - insertRowsInBranch(branch2, table, another); - assertRows(ImmutableList.of(another), iterator); - } - - resultBranch1.getJobClient().ifPresent(JobClient::cancel); - resultBranch2.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - public void testConsumeFromStartSnapshotId() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - // Produce two snapshots. - Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - insertRows(table, row1); - insertRows(table, row2); - - long startSnapshotId = table.currentSnapshot().snapshotId(); - - Row row3 = Row.of(3, "ccc", "2021-01-01"); - Row row4 = Row.of(4, "ddd", "2021-01-01"); - insertRows(table, row3, row4); - - TableResult result = - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " - + "'start-snapshot-id'='%d')*/", - TABLE, startSnapshotId); - try (CloseableIterator iterator = result.collect()) { - // the start snapshot(row2) is exclusive. - assertRows(ImmutableList.of(row3, row4), iterator); - - Row row5 = Row.of(5, "eee", "2021-01-01"); - Row row6 = Row.of(6, "fff", "2021-01-01"); - insertRows(table, row5, row6); - assertRows(ImmutableList.of(row5, row6), iterator); - - Row row7 = Row.of(7, "ggg", "2021-01-01"); - insertRows(table, row7); - assertRows(ImmutableList.of(row7), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - public void testConsumeFromStartTag() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - // Produce two snapshots. 
- Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - insertRows(table, row1); - insertRows(table, row2); - - String tagName = "t1"; - long startSnapshotId = table.currentSnapshot().snapshotId(); - table.manageSnapshots().createTag(tagName, startSnapshotId).commit(); - - Row row3 = Row.of(3, "ccc", "2021-01-01"); - Row row4 = Row.of(4, "ddd", "2021-01-01"); - insertRows(table, row3, row4); - - TableResult result = - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " - + "'start-tag'='%s')*/", - TABLE, tagName); - try (CloseableIterator iterator = result.collect()) { - // the start snapshot(row2) is exclusive. - assertRows(ImmutableList.of(row3, row4), iterator); - - Row row5 = Row.of(5, "eee", "2021-01-01"); - Row row6 = Row.of(6, "fff", "2021-01-01"); - insertRows(table, row5, row6); - assertRows(ImmutableList.of(row5, row6), iterator); - - Row row7 = Row.of(7, "ggg", "2021-01-01"); - insertRows(table, row7); - assertRows(ImmutableList.of(row7), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - - assertThatThrownBy( - () -> - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'start-tag'='%s', " - + "'start-snapshot-id'='%d' )*/", - TABLE, tagName, startSnapshotId)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("START_SNAPSHOT_ID and START_TAG cannot both be set."); - } - - @TestTemplate - void testWithParallelismWithProps() { - int customScanParallelism = defaultJobParallelism + 1; - sql( - "CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) WITH ('scan.parallelism'='%s')", - TABLE, customScanParallelism); - - final org.apache.flink.table.api.Table table = - getTableEnv().sqlQuery(String.format("select * from %s", TABLE)); - final String explain = table.explain(ExplainDetail.JSON_EXECUTION_PLAN); - final String expectedPhysicalExecutionPlanFragment = - "\"parallelism\" : " + customScanParallelism; - assertThat(explain).contains(expectedPhysicalExecutionPlanFragment); - } - - @TestTemplate - void testWithParallelismWithHints() { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - int customScanParallelism = defaultJobParallelism + 1; - - final org.apache.flink.table.api.Table table = - getTableEnv() - .sqlQuery( - String.format( - "select * from %s/*+ OPTIONS('streaming'='true', 'scan.parallelism'='%s') */", - TABLE, customScanParallelism)); - final String explain = table.explain(ExplainDetail.JSON_EXECUTION_PLAN); - final String expectedPhysicalExecutionPlanFragment = - "\"parallelism\" : " + customScanParallelism; - assertThat(explain).contains(expectedPhysicalExecutionPlanFragment); - } - - @TestTemplate - void testWithParallelismHintsOverride() { - int scanParallelismInCreateTable = defaultJobParallelism + 1; - sql( - "CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) WITH ('scan.parallelism'='%s')", - TABLE, scanParallelismInCreateTable); - - int scanParallelismInHints = defaultJobParallelism + 2; - final org.apache.flink.table.api.Table table = - getTableEnv() - .sqlQuery( - String.format( - "select * from %s/*+ OPTIONS('streaming'='true', 'scan.parallelism'='%s') */", - TABLE, scanParallelismInHints)); - final String explain = table.explain(ExplainDetail.JSON_EXECUTION_PLAN); - final String expectedPhysicalExecutionPlanFragment = - "\"parallelism\" : " + scanParallelismInHints; - assertThat(explain).contains(expectedPhysicalExecutionPlanFragment); - } -} diff --git 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java deleted file mode 100644 index 1080362af278..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java +++ /dev/null @@ -1,399 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.File; -import java.io.IOException; -import java.time.Duration; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.CountDownLatch; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.streaming.api.operators.StreamSource; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TestBase; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.TestTableLoader; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SnapshotUtil; -import org.apache.iceberg.util.ThreadPools; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestStreamingMonitorFunction extends TestBase { - - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; - private static final long WAIT_TIME_MILLIS = 10 * 1000L; - - @Parameters(name = "formatVersion = {0}") - protected static List parameters() { - return Arrays.asList(1, 2); - } - - @BeforeEach - 
@Override - public void setupTable() throws IOException { - this.metadataDir = new File(tableDir, "metadata"); - - // Construct the iceberg table. - table = create(SCHEMA, PartitionSpec.unpartitioned()); - } - - private void runSourceFunctionInTask( - TestSourceContext sourceContext, StreamingMonitorFunction function) { - Thread task = - new Thread( - () -> { - try { - function.run(sourceContext); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); - task.start(); - } - - @TestTemplate - public void testConsumeWithoutStartSnapshotId() throws Exception { - List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); - - StreamingMonitorFunction function = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { - harness.setup(); - harness.open(); - - TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); - runSourceFunctionInTask(sourceContext, function); - - awaitExpectedSplits(sourceContext); - - // Stop the stream task. - function.close(); - - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); - } - } - - @TestTemplate - public void testConsumeFromStartSnapshotId() throws Exception { - // Commit the first five transactions. - generateRecordsAndCommitTxn(5); - long startSnapshotId = table.currentSnapshot().snapshotId(); - - // Commit the next five transactions. - List> recordsList = generateRecordsAndCommitTxn(5); - - ScanContext scanContext = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .startSnapshotId(startSnapshotId) - .build(); - - StreamingMonitorFunction function = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { - harness.setup(); - harness.open(); - - TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); - runSourceFunctionInTask(sourceContext, function); - - awaitExpectedSplits(sourceContext); - - // Stop the stream task. - function.close(); - - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); - } - } - - @TestTemplate - public void testConsumeFromStartTag() throws Exception { - // Commit the first five transactions. - generateRecordsAndCommitTxn(5); - long startSnapshotId = table.currentSnapshot().snapshotId(); - String tagName = "t1"; - table.manageSnapshots().createTag(tagName, startSnapshotId).commit(); - - // Commit the next five transactions. - List> recordsList = generateRecordsAndCommitTxn(5); - - ScanContext scanContext = - ScanContext.builder().monitorInterval(Duration.ofMillis(100)).startTag(tagName).build(); - - StreamingMonitorFunction function = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { - harness.setup(); - harness.open(); - - TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); - runSourceFunctionInTask(sourceContext, function); - - awaitExpectedSplits(sourceContext); - - // Stop the stream task. 
- function.close(); - - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); - } - } - - @TestTemplate - public void testCheckpointRestore() throws Exception { - List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); - - StreamingMonitorFunction func = createFunction(scanContext); - OperatorSubtaskState state; - try (AbstractStreamOperatorTestHarness harness = createHarness(func)) { - harness.setup(); - harness.open(); - - TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); - runSourceFunctionInTask(sourceContext, func); - - awaitExpectedSplits(sourceContext); - - state = harness.snapshot(1, 1); - - // Stop the stream task. - func.close(); - - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); - } - - List> newRecordsList = generateRecordsAndCommitTxn(10); - StreamingMonitorFunction newFunc = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(newFunc)) { - harness.setup(); - // Recover to process the remaining snapshots. - harness.initializeState(state); - harness.open(); - - TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); - runSourceFunctionInTask(sourceContext, newFunc); - - awaitExpectedSplits(sourceContext); - - // Stop the stream task. - newFunc.close(); - - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA); - } - } - - private void awaitExpectedSplits(TestSourceContext sourceContext) { - Awaitility.await("expected splits should be produced") - .atMost(Duration.ofMillis(WAIT_TIME_MILLIS)) - .untilAsserted( - () -> { - assertThat(sourceContext.latch.getCount()).isEqualTo(0); - assertThat(sourceContext.splits).as("Should produce the expected splits").hasSize(1); - }); - } - - @TestTemplate - public void testInvalidMaxPlanningSnapshotCount() { - ScanContext scanContext1 = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(0) - .build(); - - assertThatThrownBy(() -> createFunction(scanContext1)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("The max-planning-snapshot-count must be greater than zero"); - - ScanContext scanContext2 = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(-10) - .build(); - - assertThatThrownBy(() -> createFunction(scanContext2)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("The max-planning-snapshot-count must be greater than zero"); - } - - @TestTemplate - public void testConsumeWithMaxPlanningSnapshotCount() throws Exception { - generateRecordsAndCommitTxn(10); - - // Use the oldest snapshot as starting to avoid the initial case. 
- long oldestSnapshotId = SnapshotUtil.oldestAncestor(table).snapshotId(); - - ScanContext scanContext = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .splitSize(1000L) - .startSnapshotId(oldestSnapshotId) - .maxPlanningSnapshotCount(Integer.MAX_VALUE) - .build(); - - FlinkInputSplit[] expectedSplits = - FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool()); - - assertThat(expectedSplits).hasSize(9); - - // This covers three cases that maxPlanningSnapshotCount is less than, equal or greater than the - // total splits number - for (int maxPlanningSnapshotCount : ImmutableList.of(1, 9, 15)) { - scanContext = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(500)) - .startSnapshotId(oldestSnapshotId) - .splitSize(1000L) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .build(); - - StreamingMonitorFunction function = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { - harness.setup(); - harness.open(); - - CountDownLatch latch = new CountDownLatch(1); - TestSourceContext sourceContext = new TestSourceContext(latch); - function.sourceContext(sourceContext); - function.monitorAndForwardSplits(); - - if (maxPlanningSnapshotCount < 10) { - assertThat(sourceContext.splits).hasSize(maxPlanningSnapshotCount); - } - } - } - } - - private List> generateRecordsAndCommitTxn(int commitTimes) throws IOException { - List> expectedRecords = Lists.newArrayList(); - for (int i = 0; i < commitTimes; i++) { - List records = RandomGenericData.generate(SCHEMA, 100, 0L); - expectedRecords.add(records); - - // Commit those records to iceberg table. - writeRecords(records); - } - return expectedRecords; - } - - private void writeRecords(List records) throws IOException { - GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp); - appender.appendToTable(records); - } - - private StreamingMonitorFunction createFunction(ScanContext scanContext) { - return new StreamingMonitorFunction( - TestTableLoader.of(tableDir.getAbsolutePath()), scanContext); - } - - private AbstractStreamOperatorTestHarness createHarness( - StreamingMonitorFunction function) throws Exception { - StreamSource streamSource = - new StreamSource<>(function); - return new AbstractStreamOperatorTestHarness<>(streamSource, 1, 1, 0); - } - - private class TestSourceContext implements SourceFunction.SourceContext { - private final List splits = Lists.newArrayList(); - private final Object checkpointLock = new Object(); - private final CountDownLatch latch; - - TestSourceContext(CountDownLatch latch) { - this.latch = latch; - } - - @Override - public void collect(FlinkInputSplit element) { - splits.add(element); - latch.countDown(); - } - - @Override - public void collectWithTimestamp(FlinkInputSplit element, long timestamp) { - collect(element); - } - - @Override - public void emitWatermark(Watermark mark) {} - - @Override - public void markAsTemporarilyIdle() {} - - @Override - public Object getCheckpointLock() { - return checkpointLock; - } - - @Override - public void close() {} - - private List toRows() throws IOException { - FlinkInputFormat format = - FlinkSource.forRowData() - .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); - - List rows = Lists.newArrayList(); - for (FlinkInputSplit split : splits) { - format.open(split); - - RowData element = null; - try { - while (!format.reachedEnd()) { - element = format.nextRecord(element); - 
rows.add(Row.of(element.getInt(0), element.getString(1).toString())); - } - } finally { - format.close(); - } - } - - return rows; - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java deleted file mode 100644 index 59c618f7a888..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.streaming.api.TimeCharacteristic; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor; -import org.apache.flink.streaming.runtime.tasks.mailbox.MailboxDefaultAction; -import org.apache.flink.streaming.runtime.tasks.mailbox.SteppingMailboxProcessor; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TestBase; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.TestTableLoader; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SnapshotUtil; -import org.apache.iceberg.util.ThreadPools; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestStreamingReaderOperator extends TestBase { - - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; - - @Parameters(name = "formatVersion = {0}") - protected static List parameters() { - 
return Arrays.asList(1, 2); - } - - @BeforeEach - @Override - public void setupTable() throws IOException { - this.metadataDir = new File(tableDir, "metadata"); - - // Construct the iceberg table. - table = create(SCHEMA, PartitionSpec.unpartitioned()); - } - - @TestTemplate - public void testProcessAllRecords() throws Exception { - List> expectedRecords = generateRecordsAndCommitTxn(10); - - List splits = generateSplits(); - assertThat(splits).hasSize(10); - - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - harness.open(); - - SteppingMailboxProcessor processor = createLocalMailbox(harness); - - List expected = Lists.newArrayList(); - for (int i = 0; i < splits.size(); i++) { - // Process this element to enqueue to mail-box. - harness.processElement(splits.get(i), -1); - - // Run the mail-box once to read all records from the given split. - assertThat(processor.runMailboxStep()).as("Should processed 1 split").isTrue(); - - // Assert the output has expected elements. - expected.addAll(expectedRecords.get(i)); - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - } - } - - @TestTemplate - public void testTriggerCheckpoint() throws Exception { - // Received emitted splits: split1, split2, split3, checkpoint request is triggered when reading - // records from - // split1. - List> expectedRecords = generateRecordsAndCommitTxn(3); - - List splits = generateSplits(); - assertThat(splits).hasSize(3); - - long timestamp = 0; - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - harness.open(); - - SteppingMailboxProcessor processor = createLocalMailbox(harness); - - harness.processElement(splits.get(0), ++timestamp); - harness.processElement(splits.get(1), ++timestamp); - harness.processElement(splits.get(2), ++timestamp); - - // Trigger snapshot state, it will start to work once all records from split0 are read. - processor.getMainMailboxExecutor().execute(() -> harness.snapshot(1, 3), "Trigger snapshot"); - - assertThat(processor.runMailboxStep()).as("Should have processed the split0").isTrue(); - assertThat(processor.runMailboxStep()) - .as("Should have processed the snapshot state action") - .isTrue(); - - TestHelpers.assertRecords(readOutputValues(harness), expectedRecords.get(0), SCHEMA); - - // Read records from split1. - assertThat(processor.runMailboxStep()).as("Should have processed the split1").isTrue(); - - // Read records from split2. - assertThat(processor.runMailboxStep()).as("Should have processed the split2").isTrue(); - - TestHelpers.assertRecords( - readOutputValues(harness), Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA); - } - } - - @TestTemplate - public void testCheckpointRestore() throws Exception { - List> expectedRecords = generateRecordsAndCommitTxn(15); - - List splits = generateSplits(); - assertThat(splits).hasSize(15); - - OperatorSubtaskState state; - List expected = Lists.newArrayList(); - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - harness.open(); - - // Enqueue all the splits. - for (FlinkInputSplit split : splits) { - harness.processElement(split, -1); - } - - // Read all records from the first five splits. 
- SteppingMailboxProcessor localMailbox = createLocalMailbox(harness); - for (int i = 0; i < 5; i++) { - expected.addAll(expectedRecords.get(i)); - assertThat(localMailbox.runMailboxStep()) - .as("Should have processed the split#" + i) - .isTrue(); - - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - - // Snapshot state now, there're 10 splits left in the state. - state = harness.snapshot(1, 1); - } - - expected.clear(); - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - // Recover to process the remaining splits. - harness.initializeState(state); - harness.open(); - - SteppingMailboxProcessor localMailbox = createLocalMailbox(harness); - - for (int i = 5; i < 10; i++) { - expected.addAll(expectedRecords.get(i)); - assertThat(localMailbox.runMailboxStep()) - .as("Should have processed the split#" + i) - .isTrue(); - - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - - // Let's process the final 5 splits now. - for (int i = 10; i < 15; i++) { - expected.addAll(expectedRecords.get(i)); - harness.processElement(splits.get(i), 1); - - assertThat(localMailbox.runMailboxStep()) - .as("Should have processed the split#" + i) - .isTrue(); - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - } - } - - private List readOutputValues( - OneInputStreamOperatorTestHarness harness) { - List results = Lists.newArrayList(); - for (RowData rowData : harness.extractOutputValues()) { - results.add(Row.of(rowData.getInt(0), rowData.getString(1).toString())); - } - return results; - } - - private List> generateRecordsAndCommitTxn(int commitTimes) throws IOException { - List> expectedRecords = Lists.newArrayList(); - for (int i = 0; i < commitTimes; i++) { - List records = RandomGenericData.generate(SCHEMA, 100, 0L); - expectedRecords.add(records); - - // Commit those records to iceberg table. - writeRecords(records); - } - return expectedRecords; - } - - private void writeRecords(List records) throws IOException { - GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp); - appender.appendToTable(records); - } - - private List generateSplits() { - List inputSplits = Lists.newArrayList(); - - List snapshotIds = SnapshotUtil.currentAncestorIds(table); - for (int i = snapshotIds.size() - 1; i >= 0; i--) { - ScanContext scanContext; - if (i == snapshotIds.size() - 1) { - // Generate the splits from the first snapshot. - scanContext = ScanContext.builder().useSnapshotId(snapshotIds.get(i)).build(); - } else { - // Generate the splits between the previous snapshot and current snapshot. - scanContext = - ScanContext.builder() - .startSnapshotId(snapshotIds.get(i + 1)) - .endSnapshotId(snapshotIds.get(i)) - .build(); - } - - Collections.addAll( - inputSplits, - FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool())); - } - - return inputSplits; - } - - private OneInputStreamOperatorTestHarness createReader() - throws Exception { - // This input format is used to opening the emitted split. 
- FlinkInputFormat inputFormat = - FlinkSource.forRowData() - .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); - - OneInputStreamOperatorFactory factory = - StreamingReaderOperator.factory(inputFormat); - OneInputStreamOperatorTestHarness harness = - new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0); - harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); - - return harness; - } - - private SteppingMailboxProcessor createLocalMailbox( - OneInputStreamOperatorTestHarness harness) { - return new SteppingMailboxProcessor( - MailboxDefaultAction.Controller::suspendDefaultAction, - harness.getTaskMailbox(), - StreamTaskActionExecutor.IMMEDIATE); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java deleted file mode 100644 index 1e612b0a2b2a..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */
-package org.apache.iceberg.flink.source.assigner;
-
-import static org.assertj.core.api.Assertions.assertThat;
-import static org.assertj.core.api.Assertions.fail;
-
-import java.nio.file.Path;
-import java.util.Collection;
-import java.util.List;
-import java.util.concurrent.CompletableFuture;
-import java.util.concurrent.atomic.AtomicBoolean;
-import org.apache.iceberg.flink.source.SplitHelpers;
-import org.apache.iceberg.flink.source.split.IcebergSourceSplit;
-import org.apache.iceberg.flink.source.split.IcebergSourceSplitState;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.io.TempDir;
-
-public abstract class SplitAssignerTestBase {
-  @TempDir protected Path temporaryFolder;
-
-  @Test
-  public void testEmptyInitialization() {
-    SplitAssigner assigner = splitAssigner();
-    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
-  }
-
-  /** Test a sequence of interactions for StaticEnumerator */
-  @Test
-  public void testStaticEnumeratorSequence() throws Exception {
-    SplitAssigner assigner = splitAssigner();
-    assigner.onDiscoveredSplits(createSplits(4, 1, "1"));
-
-    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
-    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
-    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
-    assertSnapshot(assigner, 1);
-    assigner.onUnassignedSplits(createSplits(1, 1, "1"));
-    assertSnapshot(assigner, 2);
-
-    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
-    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
-    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
-    assertSnapshot(assigner, 0);
-  }
-
-  /** Test a sequence of interactions for ContinuousEnumerator */
-  @Test
-  public void testContinuousEnumeratorSequence() throws Exception {
-    SplitAssigner assigner = splitAssigner();
-    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
-
-    List<IcebergSourceSplit> splits1 = createSplits(1, 1, "1");
-    assertAvailableFuture(assigner, 1, () -> assigner.onDiscoveredSplits(splits1));
-    List<IcebergSourceSplit> splits2 = createSplits(1, 1, "1");
-    assertAvailableFuture(assigner, 1, () -> assigner.onUnassignedSplits(splits2));
-
-    assigner.onDiscoveredSplits(createSplits(2, 1, "1"));
-    assertSnapshot(assigner, 2);
-    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
-    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
-    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
-    assertSnapshot(assigner, 0);
-  }
-
-  private void assertAvailableFuture(
-      SplitAssigner assigner, int splitCount, Runnable addSplitsRunnable) {
-    // register callback
-    AtomicBoolean futureCompleted = new AtomicBoolean();
-    CompletableFuture<Void> future = assigner.isAvailable();
-    future.thenAccept(ignored -> futureCompleted.set(true));
-    // calling isAvailable again should return the same object reference
-    // note that thenAccept will return a new future.
-    // we want to assert the same instance on the assigner returned future
-    assertThat(assigner.isAvailable()).isSameAs(future);
-
-    // now add some splits
-    addSplitsRunnable.run();
-    assertThat(futureCompleted.get()).isTrue();
-
-    for (int i = 0; i < splitCount; ++i) {
-      assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
-    }
-    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
-    assertSnapshot(assigner, 0);
-  }
-
-  protected void assertGetNext(SplitAssigner assigner, GetSplitResult.Status expectedStatus) {
-    GetSplitResult result = assigner.getNext(null);
-    assertThat(result.status()).isEqualTo(expectedStatus);
-    switch (expectedStatus) {
-      case AVAILABLE:
-        assertThat(result.split()).isNotNull();
-        break;
-      case CONSTRAINED:
-      case UNAVAILABLE:
-        assertThat(result.split()).isNull();
-        break;
-      default:
-        fail("Unknown status: %s", expectedStatus);
-    }
-  }
-
-  protected void assertSnapshot(SplitAssigner assigner, int splitCount) {
-    Collection<IcebergSourceSplitState> stateBeforeGet = assigner.state();
-    assertThat(stateBeforeGet).hasSize(splitCount);
-  }
-
-  protected List<IcebergSourceSplit> createSplits(int fileCount, int filesPerSplit, String version)
-      throws Exception {
-    return SplitHelpers.createSplitsFromTransientHadoopTable(
-        temporaryFolder, fileCount, filesPerSplit, version);
-  }
-
-  protected abstract SplitAssigner splitAssigner();
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java
deleted file mode 100644
index 17e64bbf0594..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.source.assigner;
-
-import org.apache.iceberg.flink.source.SplitHelpers;
-import org.junit.jupiter.api.Test;
-
-public class TestDefaultSplitAssigner extends SplitAssignerTestBase {
-  @Override
-  protected SplitAssigner splitAssigner() {
-    return new DefaultSplitAssigner(null);
-  }
-
-  /** Test the assigner when multiple files are in a single split */
-  @Test
-  public void testMultipleFilesInASplit() throws Exception {
-    SplitAssigner assigner = splitAssigner();
-    assigner.onDiscoveredSplits(
-        SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 4, 2));
-
-    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
-    assertSnapshot(assigner, 1);
-    assertGetNext(assigner, GetSplitResult.Status.AVAILABLE);
-    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
-    assertSnapshot(assigner, 0);
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java
deleted file mode 100644
index 2b65977fb2f9..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.source.assigner;
-
-import static org.assertj.core.api.Assertions.assertThat;
-import static org.assertj.core.api.Assertions.assertThatThrownBy;
-
-import java.util.List;
-import org.apache.iceberg.ContentFile;
-import org.apache.iceberg.flink.source.split.IcebergSourceSplit;
-import org.apache.iceberg.flink.source.split.SerializableComparator;
-import org.apache.iceberg.flink.source.split.SplitComparators;
-import org.apache.iceberg.util.SerializationUtil;
-import org.junit.jupiter.api.Test;
-
-public class TestFileSequenceNumberBasedSplitAssigner extends SplitAssignerTestBase {
-  @Override
-  protected SplitAssigner splitAssigner() {
-    return new OrderedSplitAssignerFactory(SplitComparators.fileSequenceNumber()).createAssigner();
-  }
-
-  /** Test the assigner when multiple files are in a single split */
-  @Test
-  public void testMultipleFilesInAnIcebergSplit() {
-    SplitAssigner assigner = splitAssigner();
-    assertThatThrownBy(
-            () -> assigner.onDiscoveredSplits(createSplits(4, 2, "2")),
-            "Multiple files in a split is not allowed")
-        .isInstanceOf(IllegalArgumentException.class)
-        .hasMessageContaining("Please use 'split-file-open-cost'");
-  }
-
-  /** Test sorted splits */
-  @Test
-  public void testSplitSort() throws Exception {
-    SplitAssigner assigner = splitAssigner();
-    List<IcebergSourceSplit> splits = createSplits(5, 1, "2");
-
-    assigner.onDiscoveredSplits(splits.subList(3, 5));
-    assigner.onDiscoveredSplits(splits.subList(0, 1));
-    assigner.onDiscoveredSplits(splits.subList(1, 3));
-
-    assertGetNext(assigner, 1L);
-    assertGetNext(assigner, 2L);
-    assertGetNext(assigner, 3L);
-    assertGetNext(assigner, 4L);
-    assertGetNext(assigner, 5L);
-
-    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
-  }
-
-  @Test
-  public void testSerializable() {
-    byte[] bytes = SerializationUtil.serializeToBytes(SplitComparators.fileSequenceNumber());
-    SerializableComparator<IcebergSourceSplit> comparator =
-        SerializationUtil.deserializeFromBytes(bytes);
-    assertThat(comparator).isNotNull();
-  }
-
-  private void assertGetNext(SplitAssigner assigner, Long expectedSequenceNumber) {
-    GetSplitResult result = assigner.getNext(null);
-    ContentFile<?> file = result.split().task().files().iterator().next().file();
-    assertThat(file.fileSequenceNumber()).isEqualTo(expectedSequenceNumber);
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java
deleted file mode 100644
index 84f04d5a530a..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.time.temporal.ChronoUnit; -import java.util.List; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.reader.ColumnStatsWatermarkExtractor; -import org.apache.iceberg.flink.source.reader.ReaderUtil; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.SerializableComparator; -import org.apache.iceberg.flink.source.split.SplitComparators; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SerializationUtil; -import org.junit.jupiter.api.Test; - -public class TestWatermarkBasedSplitAssigner extends SplitAssignerTestBase { - public static final Schema SCHEMA = - new Schema(required(1, "timestamp_column", Types.TimestampType.withoutZone())); - private static final GenericAppenderFactory APPENDER_FACTORY = new GenericAppenderFactory(SCHEMA); - - @Override - protected SplitAssigner splitAssigner() { - return new OrderedSplitAssignerFactory( - SplitComparators.watermark( - new ColumnStatsWatermarkExtractor(SCHEMA, "timestamp_column", null))) - .createAssigner(); - } - - /** Test the assigner when multiple files are in a single split */ - @Test - public void testMultipleFilesInAnIcebergSplit() { - SplitAssigner assigner = splitAssigner(); - assigner.onDiscoveredSplits(createSplits(4, 2, "2")); - - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - } - - /** Test sorted splits */ - @Test - public void testSplitSort() { - SplitAssigner assigner = splitAssigner(); - - Instant now = Instant.now(); - List splits = - IntStream.range(0, 5) - .mapToObj(i -> splitFromInstant(now.plus(i, ChronoUnit.MINUTES))) - .collect(Collectors.toList()); - - assigner.onDiscoveredSplits(splits.subList(3, 5)); - assigner.onDiscoveredSplits(splits.subList(0, 1)); - assigner.onDiscoveredSplits(splits.subList(1, 3)); - - assertGetNext(assigner, splits.get(0)); - assertGetNext(assigner, splits.get(1)); - assertGetNext(assigner, splits.get(2)); - assertGetNext(assigner, splits.get(3)); - assertGetNext(assigner, splits.get(4)); - - assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - } - - @Test - public void testSerializable() { - byte[] bytes = - SerializationUtil.serializeToBytes( - SplitComparators.watermark( - new ColumnStatsWatermarkExtractor( - TestFixtures.SCHEMA, "id", TimeUnit.MILLISECONDS))); - SerializableComparator comparator = - SerializationUtil.deserializeFromBytes(bytes); - assertThat(comparator).isNotNull(); - } - - private void assertGetNext(SplitAssigner assigner, IcebergSourceSplit split) { 
- GetSplitResult result = assigner.getNext(null); - assertThat(split).isEqualTo(result.split()); - } - - @Override - protected List createSplits( - int fileCount, int filesPerSplit, String version) { - return IntStream.range(0, fileCount / filesPerSplit) - .mapToObj( - splitNum -> - splitFromRecords( - IntStream.range(0, filesPerSplit) - .mapToObj( - fileNum -> - RandomGenericData.generate( - SCHEMA, 2, (long) splitNum * filesPerSplit + fileNum)) - .collect(Collectors.toList()))) - .collect(Collectors.toList()); - } - - private IcebergSourceSplit splitFromInstant(Instant instant) { - Record record = GenericRecord.create(SCHEMA); - record.set(0, LocalDateTime.ofInstant(instant, ZoneOffset.UTC)); - return splitFromRecords(ImmutableList.of(ImmutableList.of(record))); - } - - private IcebergSourceSplit splitFromRecords(List> records) { - try { - return IcebergSourceSplit.fromCombinedScanTask( - ReaderUtil.createCombinedScanTask( - records, temporaryFolder, FileFormat.PARQUET, APPENDER_FACTORY)); - } catch (IOException e) { - throw new RuntimeException("Split creation exception", e); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java deleted file mode 100644 index ebc92df02360..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */
-package org.apache.iceberg.flink.source.enumerator;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.NavigableMap;
-import java.util.TreeMap;
-import org.apache.iceberg.flink.source.ScanContext;
-import org.apache.iceberg.flink.source.split.IcebergSourceSplit;
-import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-
-class ManualContinuousSplitPlanner implements ContinuousSplitPlanner {
-  private final int maxPlanningSnapshotCount;
-  // track splits per snapshot
-  private final NavigableMap<Long, List<IcebergSourceSplit>> splits;
-  private long latestSnapshotId;
-  private int remainingFailures;
-
-  ManualContinuousSplitPlanner(ScanContext scanContext, int expectedFailures) {
-    this.maxPlanningSnapshotCount = scanContext.maxPlanningSnapshotCount();
-    this.splits = new TreeMap<>();
-    this.latestSnapshotId = 0L;
-    this.remainingFailures = expectedFailures;
-  }
-
-  @Override
-  public synchronized ContinuousEnumerationResult planSplits(
-      IcebergEnumeratorPosition lastPosition) {
-    if (remainingFailures > 0) {
-      remainingFailures--;
-      throw new RuntimeException("Expected failure at planning");
-    }
-
-    long fromSnapshotIdExclusive = 0;
-    if (lastPosition != null && lastPosition.snapshotId() != null) {
-      fromSnapshotIdExclusive = lastPosition.snapshotId();
-    }
-
-    Preconditions.checkArgument(
-        fromSnapshotIdExclusive <= latestSnapshotId,
-        "last enumerated snapshotId is greater than the latestSnapshotId");
-    if (fromSnapshotIdExclusive == latestSnapshotId) {
-      // already discovered everything.
-      return new ContinuousEnumerationResult(Lists.newArrayList(), lastPosition, lastPosition);
-    }
-
-    // find the subset of snapshots to return discovered splits
-    long toSnapshotIdInclusive;
-    if (latestSnapshotId - fromSnapshotIdExclusive > maxPlanningSnapshotCount) {
-      toSnapshotIdInclusive = fromSnapshotIdExclusive + maxPlanningSnapshotCount;
-    } else {
-      toSnapshotIdInclusive = latestSnapshotId;
-    }
-
-    List<IcebergSourceSplit> discoveredSplits = Lists.newArrayList();
-    NavigableMap<Long, List<IcebergSourceSplit>> discoveredView =
-        splits.subMap(fromSnapshotIdExclusive, false, toSnapshotIdInclusive, true);
-    discoveredView.forEach((snapshotId, snapshotSplits) -> discoveredSplits.addAll(snapshotSplits));
-    ContinuousEnumerationResult result =
-        new ContinuousEnumerationResult(
-            discoveredSplits,
-            lastPosition,
-            // use the snapshot Id as snapshot timestamp.
-            IcebergEnumeratorPosition.of(toSnapshotIdInclusive, toSnapshotIdInclusive));
-    return result;
-  }
-
-  /**
-   * Add a collection of new splits. A monotonically increased snapshotId is assigned to each batch
-   * of splits added by this method.
-   */
-  public synchronized void addSplits(List<IcebergSourceSplit> newSplits) {
-    latestSnapshotId += 1;
-    splits.put(latestSnapshotId, newSplits);
-  }
-
-  @Override
-  public void close() throws IOException {}
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java
deleted file mode 100644
index 41a787762fda..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java
+++ /dev/null
@@ -1,352 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.connector.testutils.source.reader.TestingSplitEnumeratorContext; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.SplitHelpers; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; -import org.apache.iceberg.flink.source.assigner.DefaultSplitAssigner; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; -import org.apache.iceberg.flink.source.split.SplitRequestEvent; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestContinuousIcebergEnumerator { - @TempDir protected Path temporaryFolder; - - @Test - public void testDiscoverSplitWhenNoReaderRegistered() throws Exception { - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); - ContinuousIcebergEnumerator enumerator = - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - Collection pendingSplitsEmpty = - enumerator.snapshotState(1).pendingSplits(); - assertThat(pendingSplitsEmpty).isEmpty(); - - // make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - splitPlanner.addSplits(splits); - enumeratorContext.triggerAllActions(); - - Collection pendingSplits = enumerator.snapshotState(2).pendingSplits(); - assertThat(pendingSplits).hasSize(1); - IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); - assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); - assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); - } - - @Test - public void testDiscoverWhenReaderRegistered() throws Exception { - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); - ContinuousIcebergEnumerator enumerator = - 
createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // register one reader, and let it request a split - enumeratorContext.registerReader(2, "localhost"); - enumerator.addReader(2); - enumerator.handleSourceEvent(2, new SplitRequestEvent()); - - // make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - splitPlanner.addSplits(splits); - enumeratorContext.triggerAllActions(); - - assertThat(enumerator.snapshotState(1).pendingSplits()).isEmpty(); - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .contains(splits.get(0)); - } - - @Test - public void testRequestingReaderUnavailableWhenSplitDiscovered() throws Exception { - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); - ContinuousIcebergEnumerator enumerator = - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // register one reader, and let it request a split - enumeratorContext.registerReader(2, "localhost"); - enumerator.addReader(2); - enumerator.handleSourceEvent(2, new SplitRequestEvent()); - - // remove the reader (like in a failure) - enumeratorContext.registeredReaders().remove(2); - - // make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - assertThat(splits).hasSize(1); - splitPlanner.addSplits(splits); - enumeratorContext.triggerAllActions(); - - assertThat(enumeratorContext.getSplitAssignments()).doesNotContainKey(2); - List pendingSplitIds = - enumerator.snapshotState(1).pendingSplits().stream() - .map(IcebergSourceSplitState::split) - .map(IcebergSourceSplit::splitId) - .collect(Collectors.toList()); - assertThat(pendingSplitIds).hasSameSizeAs(splits).first().isEqualTo(splits.get(0).splitId()); - - // register the reader again, and let it request a split - enumeratorContext.registerReader(2, "localhost"); - enumerator.addReader(2); - enumerator.handleSourceEvent(2, new SplitRequestEvent()); - - assertThat(enumerator.snapshotState(2).pendingSplits()).isEmpty(); - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .contains(splits.get(0)); - } - - @Test - public void testThrottlingDiscovery() throws Exception { - // create 10 splits - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 10, 1); - - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - // discover one snapshot at a time - .maxPlanningSnapshotCount(1) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); - ContinuousIcebergEnumerator enumerator = - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // register reader-2, and let it request a split - enumeratorContext.registerReader(2, "localhost"); - enumerator.addReader(2); - enumerator.handleSourceEvent(2, new SplitRequestEvent()); - - // add splits[0] to the planner for next discovery - splitPlanner.addSplits(Arrays.asList(splits.get(0))); - 
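// (ManualContinuousSplitPlanner above assigns a new, monotonically increasing snapshotId to
// every addSplits() batch, so with maxPlanningSnapshotCount(1) each triggered discovery
// cycle can return at most one of these batches.)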
enumeratorContext.triggerAllActions(); - - // because discovered split was assigned to reader, pending splits should be empty - assertThat(enumerator.snapshotState(1).pendingSplits()).isEmpty(); - // split assignment to reader-2 should contain splits[0, 1) - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .containsExactlyElementsOf(splits.subList(0, 1)); - - // add the remaining 9 splits (one for every snapshot) - // run discovery cycles while reader-2 still processing the splits[0] - for (int i = 1; i < 10; ++i) { - splitPlanner.addSplits(Arrays.asList(splits.get(i))); - enumeratorContext.triggerAllActions(); - } - - // can only discover up to 3 snapshots/splits - assertThat(enumerator.snapshotState(2).pendingSplits()).hasSize(3); - // split assignment to reader-2 should be splits[0, 1) - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .containsExactlyElementsOf(splits.subList(0, 1)); - - // now reader-2 finished splits[0] - enumerator.handleSourceEvent(2, new SplitRequestEvent(Arrays.asList(splits.get(0).splitId()))); - enumeratorContext.triggerAllActions(); - // still have 3 pending splits. After assigned splits[1] to reader-2, one more split was - // discovered and added. - assertThat(enumerator.snapshotState(3).pendingSplits()).hasSize(3); - // split assignment to reader-2 should be splits[0, 2) - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .containsExactlyElementsOf(splits.subList(0, 2)); - - // run 3 more split discovery cycles - for (int i = 0; i < 3; ++i) { - enumeratorContext.triggerAllActions(); - } - - // no more splits are discovered due to throttling - assertThat(enumerator.snapshotState(4).pendingSplits()).hasSize(3); - // split assignment to reader-2 should still be splits[0, 2) - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .containsExactlyElementsOf(splits.subList(0, 2)); - - // now reader-2 finished splits[1] - enumerator.handleSourceEvent(2, new SplitRequestEvent(Arrays.asList(splits.get(1).splitId()))); - enumeratorContext.triggerAllActions(); - // still have 3 pending splits. After assigned new splits[2] to reader-2, one more split was - // discovered and added. 
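// (Whenever reader-2 finishes a split, the next cycle assigns it one pending split and
// discovers at most one new batch; in between, the kind of throttling covered by
// TestEnumerationHistory further below keeps the backlog parked at three pending splits.)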
- assertThat(enumerator.snapshotState(5).pendingSplits()).hasSize(3); - // split assignment to reader-2 should be splits[0, 3) - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .containsExactlyElementsOf(splits.subList(0, 3)); - } - - @Test - public void testTransientPlanningErrorsWithSuccessfulRetry() throws Exception { - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .maxPlanningSnapshotCount(1) - .maxAllowedPlanningFailures(2) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 1); - ContinuousIcebergEnumerator enumerator = - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // Make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - splitPlanner.addSplits(splits); - - // Trigger a planning and check that no splits returned due to the planning error - enumeratorContext.triggerAllActions(); - assertThat(enumerator.snapshotState(2).pendingSplits()).isEmpty(); - - // Second scan planning should succeed and discover the expected splits - enumeratorContext.triggerAllActions(); - Collection pendingSplits = enumerator.snapshotState(3).pendingSplits(); - assertThat(pendingSplits).hasSize(1); - IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); - assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); - assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); - } - - @Test - public void testOverMaxAllowedPlanningErrors() throws Exception { - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .maxPlanningSnapshotCount(1) - .maxAllowedPlanningFailures(1) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 2); - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // Make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - splitPlanner.addSplits(splits); - - // Check that the scheduler response ignores the current error and continues to run until the - // failure limit is reached - enumeratorContext.triggerAllActions(); - assertThat(enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).isDone()) - .isFalse(); - - // Check that the task has failed with the expected exception after the failure limit is reached - enumeratorContext.triggerAllActions(); - assertThat(enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).isDone()) - .isTrue(); - assertThatThrownBy( - () -> enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).get()) - .hasCauseInstanceOf(RuntimeException.class) - .hasMessageContaining("Failed to discover new split"); - } - - @Test - public void testPlanningIgnoringErrors() throws Exception { - int expectedFailures = 3; - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - 
.startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .maxPlanningSnapshotCount(1) - .maxAllowedPlanningFailures(-1) - .build(); - ManualContinuousSplitPlanner splitPlanner = - new ManualContinuousSplitPlanner(scanContext, expectedFailures); - ContinuousIcebergEnumerator enumerator = - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // Make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - splitPlanner.addSplits(splits); - - Collection pendingSplits; - // Can not discover the new split with planning failures - for (int i = 0; i < expectedFailures; ++i) { - enumeratorContext.triggerAllActions(); - pendingSplits = enumerator.snapshotState(i).pendingSplits(); - assertThat(pendingSplits).isEmpty(); - } - - // Discovered the new split after a successful scan planning - enumeratorContext.triggerAllActions(); - pendingSplits = enumerator.snapshotState(expectedFailures + 1).pendingSplits(); - assertThat(pendingSplits).hasSize(1); - IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); - assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); - assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); - } - - private static ContinuousIcebergEnumerator createEnumerator( - SplitEnumeratorContext context, - ScanContext scanContext, - ContinuousSplitPlanner splitPlanner) { - - ContinuousIcebergEnumerator enumerator = - new ContinuousIcebergEnumerator( - context, - new DefaultSplitAssigner(null, Collections.emptyList()), - scanContext, - splitPlanner, - null); - enumerator.start(); - return enumerator; - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java deleted file mode 100644 index 9a4bfa03e28b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java +++ /dev/null @@ -1,734 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.enumerator; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import java.util.Set; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.HadoopTableExtension; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -public class TestContinuousSplitPlannerImpl { - @TempDir protected Path temporaryFolder; - - private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; - private static final AtomicLong RANDOM_SEED = new AtomicLong(); - - @RegisterExtension - private static final HadoopTableExtension TABLE_RESOURCE = - new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); - - private GenericAppenderHelper dataAppender; - private DataFile dataFile1; - private Snapshot snapshot1; - private DataFile dataFile2; - private Snapshot snapshot2; - - @BeforeEach - public void before() throws IOException { - dataAppender = new GenericAppenderHelper(TABLE_RESOURCE.table(), FILE_FORMAT, temporaryFolder); - } - - private void appendTwoSnapshots() throws IOException { - // snapshot1 - List batch1 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - dataFile1 = dataAppender.writeFile(null, batch1); - dataAppender.appendToTable(dataFile1); - snapshot1 = TABLE_RESOURCE.table().currentSnapshot(); - - // snapshot2 - List batch2 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 1L); - dataFile2 = dataAppender.writeFile(null, batch2); - dataAppender.appendToTable(dataFile2); - snapshot2 = TABLE_RESOURCE.table().currentSnapshot(); - } - - /** - * @return the last enumerated snapshot id - */ - private CycleResult verifyOneCycle( - ContinuousSplitPlannerImpl splitPlanner, IcebergEnumeratorPosition lastPosition) - throws Exception { - List batch = - RandomGenericData.generate(TestFixtures.SCHEMA, 2, RANDOM_SEED.incrementAndGet()); - DataFile dataFile = dataAppender.writeFile(null, batch); - dataAppender.appendToTable(dataFile); - Snapshot snapshot = TABLE_RESOURCE.table().currentSnapshot(); - - ContinuousEnumerationResult result = splitPlanner.planSplits(lastPosition); - assertThat(result.fromPosition().snapshotId()).isEqualTo(lastPosition.snapshotId()); - assertThat(result.fromPosition().snapshotTimestampMs()) - .isEqualTo(lastPosition.snapshotTimestampMs()); - assertThat(result.toPosition().snapshotId().longValue()).isEqualTo(snapshot.snapshotId()); - assertThat(result.toPosition().snapshotTimestampMs().longValue()) - 
.isEqualTo(snapshot.timestampMillis()); - assertThat(result.splits()).hasSize(1); - IcebergSourceSplit split = Iterables.getOnlyElement(result.splits()); - assertThat(split.task().files()) - .hasSize(1) - .first() - .satisfies( - fileScanTask -> - assertThat(fileScanTask.file().location()).isEqualTo(dataFile.location())); - return new CycleResult(result.toPosition(), split); - } - - @Test - public void testTableScanThenIncrementalWithEmptyTable() throws Exception { - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); - assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().isEmpty()).isTrue(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = - splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); - assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()).isTrue(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().isEmpty()).isTrue(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - // next 3 snapshots - IcebergEnumeratorPosition lastPosition = emptyTableSecondDiscoveryResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testTableScanThenIncrementalWithNonEmptyTable() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - assertThat(initialResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - assertThat(initialResult.splits()).hasSize(1); - IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); - assertThat(split.task().files()).hasSize(2); - Set discoveredFiles = - split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().location()) - .collect(Collectors.toSet()); - Set expectedFiles = ImmutableSet.of(dataFile1.location(), dataFile2.location()); - assertThat(discoveredFiles).containsExactlyInAnyOrderElementsOf(expectedFiles); - - IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @ParameterizedTest - @EnumSource( - value = StreamingStartingStrategy.class, - names = {"INCREMENTAL_FROM_LATEST_SNAPSHOT", "INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE"}) - public void 
testIncrementalFromLatestSnapshotWithEmptyTable( - StreamingStartingStrategy startingStrategy) throws Exception { - ScanContext scanContext = - ScanContext.builder().startingStrategy(startingStrategy).splitSize(1L).build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); - assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().isEmpty()).isTrue(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = - splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); - assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()).isTrue(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().isEmpty()).isTrue(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - // latest mode should discover both snapshots, as latest position is marked by when job starts - appendTwoSnapshots(); - ContinuousEnumerationResult afterTwoSnapshotsAppended = - splitPlanner.planSplits(emptyTableSecondDiscoveryResult.toPosition()); - assertThat(afterTwoSnapshotsAppended.splits()).hasSize(2); - - // next 3 snapshots - IcebergEnumeratorPosition lastPosition = afterTwoSnapshotsAppended.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromLatestSnapshotWithNonEmptyTable() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - // For inclusive behavior, the initial result should point to snapshot1 - // Then the next incremental scan shall discover files from latest snapshot2 (inclusive) - assertThat(initialResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot1.snapshotId()); - assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(initialResult.splits()).isEmpty(); - - ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - assertThat(secondResult.fromPosition().snapshotId().longValue()) - .isEqualTo(snapshot1.snapshotId()); - assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(secondResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); - assertThat(split.task().files()).hasSize(1); - Set discoveredFiles = - split.task().files().stream() - 
.map(fileScanTask -> fileScanTask.file().location()) - .collect(Collectors.toSet()); - // should discover dataFile2 appended in snapshot2 - Set expectedFiles = ImmutableSet.of(dataFile2.location()); - assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); - - IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromLatestSnapshotExclusiveWithNonEmptyTable() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.splits()).isEmpty(); - assertThat(initialResult.fromPosition()).isNull(); - // For exclusive behavior, the initial result should point to snapshot2 - assertThat(initialResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - - // Then the next incremental scan shall discover no files - ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - assertThat(initialResult.splits()).isEmpty(); - assertThat(secondResult.fromPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - assertThat(secondResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - - IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromEarliestSnapshotWithEmptyTable() throws Exception { - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); - assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotId()).isNull(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = - splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); - assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotId()).isNull(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotId()).isNull(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - // next 3 
snapshots - IcebergEnumeratorPosition lastPosition = emptyTableSecondDiscoveryResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromEarliestSnapshotWithNonEmptyTable() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - // For inclusive behavior, the initial result should point to snapshot1's parent, - // which leads to null snapshotId and snapshotTimestampMs. - assertThat(initialResult.toPosition().snapshotId()).isNull(); - assertThat(initialResult.toPosition().snapshotTimestampMs()).isNull(); - assertThat(initialResult.splits()).isEmpty(); - - ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - assertThat(secondResult.fromPosition().snapshotId()).isNull(); - assertThat(secondResult.fromPosition().snapshotTimestampMs()).isNull(); - assertThat(secondResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); - assertThat(split.task().files()).hasSize(2); - Set discoveredFiles = - split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().location()) - .collect(Collectors.toSet()); - // should discover files appended in both snapshot1 and snapshot2 - Set expectedFiles = ImmutableSet.of(dataFile1.location(), dataFile2.location()); - assertThat(discoveredFiles).containsExactlyInAnyOrderElementsOf(expectedFiles); - - IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromSnapshotIdWithEmptyTable() { - ScanContext scanContextWithInvalidSnapshotId = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(1L) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl( - TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); - - assertThatThrownBy(() -> splitPlanner.planSplits(null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Start snapshot id not found in history: 1"); - } - - @Test - public void testIncrementalFromSnapshotIdWithInvalidIds() throws Exception { - appendTwoSnapshots(); - - // find an invalid snapshotId - long invalidSnapshotId = 0L; - while (invalidSnapshotId == snapshot1.snapshotId() - || invalidSnapshotId == snapshot2.snapshotId()) { - invalidSnapshotId++; - } - - ScanContext scanContextWithInvalidSnapshotId = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(invalidSnapshotId) - .build(); - - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl( - TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); - - assertThatThrownBy(() -> splitPlanner.planSplits(null)) - 
.isInstanceOf(IllegalArgumentException.class) - .hasMessage("Start snapshot id not found in history: " + invalidSnapshotId); - } - - @Test - public void testIncrementalFromSnapshotId() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot2.snapshotId()) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - // For inclusive behavior of snapshot2, the initial result should point to snapshot1 (as - // snapshot2's parent) - assertThat(initialResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot1.snapshotId()); - assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(initialResult.splits()).isEmpty(); - - ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - assertThat(secondResult.fromPosition().snapshotId().longValue()) - .isEqualTo(snapshot1.snapshotId()); - assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(secondResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); - assertThat(split.task().files()).hasSize(1); - Set discoveredFiles = - split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().location()) - .collect(Collectors.toSet()); - // should discover dataFile2 appended in snapshot2 - Set expectedFiles = ImmutableSet.of(dataFile2.location()); - assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); - - IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromSnapshotTimestampWithEmptyTable() { - ScanContext scanContextWithInvalidSnapshotId = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(1L) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl( - TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); - - assertThatThrownBy(() -> splitPlanner.planSplits(null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot find a snapshot after: 1"); - } - - @Test - public void testIncrementalFromSnapshotTimestampWithInvalidIds() throws Exception { - appendTwoSnapshots(); - - long invalidSnapshotTimestampMs = snapshot2.timestampMillis() + 1000L; - - ScanContext scanContextWithInvalidSnapshotId = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(invalidSnapshotTimestampMs) - .build(); - - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl( - TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); - - assertThatThrownBy(() -> splitPlanner.planSplits(null)) - .isInstanceOf(IllegalArgumentException.class) - 
.hasMessageStartingWith("Cannot find a snapshot after:"); - } - - @Test - public void testIncrementalFromSnapshotTimestamp() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot2.timestampMillis()) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - // For inclusive behavior, the initial result should point to snapshot1 (as snapshot2's parent). - assertThat(initialResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot1.snapshotId()); - assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(initialResult.splits()).isEmpty(); - - ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - assertThat(secondResult.fromPosition().snapshotId().longValue()) - .isEqualTo(snapshot1.snapshotId()); - assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(secondResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); - assertThat(split.task().files()).hasSize(1); - Set discoveredFiles = - split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().location()) - .collect(Collectors.toSet()); - // should discover dataFile2 appended in snapshot2 - Set expectedFiles = ImmutableSet.of(dataFile2.location()); - assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); - - IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testMaxPlanningSnapshotCount() throws Exception { - appendTwoSnapshots(); - // append 3 more snapshots - for (int i = 2; i < 5; ++i) { - appendSnapshot(i, 2); - } - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - // limit to 1 snapshot per discovery - .maxPlanningSnapshotCount(1) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - // For inclusive behavior, the initial result should point to snapshot1's parent, - // which leads to null snapshotId and snapshotTimestampMs. 
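// (The enumerator position always records the last snapshot that has already been
// enumerated, so a null position means nothing has been enumerated yet; each subsequent
// planSplits() call may then advance the position by at most maxPlanningSnapshotCount
// snapshots, which is what the assertions below verify.)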
- assertThat(initialResult.toPosition().snapshotId()).isNull(); - assertThat(initialResult.toPosition().snapshotTimestampMs()).isNull(); - assertThat(initialResult.splits()).isEmpty(); - - ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - // should discover dataFile1 appended in snapshot1 - verifyMaxPlanningSnapshotCountResult( - secondResult, null, snapshot1, ImmutableSet.of(dataFile1.location())); - - ContinuousEnumerationResult thirdResult = splitPlanner.planSplits(secondResult.toPosition()); - // should discover dataFile2 appended in snapshot2 - verifyMaxPlanningSnapshotCountResult( - thirdResult, snapshot1, snapshot2, ImmutableSet.of(dataFile2.location())); - } - - @Test - public void testTableScanNoStats() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .includeColumnStats(false) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.splits()).hasSize(1); - IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); - assertThat(split.task().files()).hasSize(2); - verifyStatCount(split, 0); - - IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); - for (int i = 0; i < 3; ++i) { - CycleResult result = verifyOneCycle(splitPlanner, lastPosition); - verifyStatCount(result.split, 0); - lastPosition = result.lastPosition; - } - } - - @Test - public void testTableScanAllStats() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .includeColumnStats(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.splits()).hasSize(1); - IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); - assertThat(split.task().files()).hasSize(2); - verifyStatCount(split, 3); - - IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); - for (int i = 0; i < 3; ++i) { - CycleResult result = verifyOneCycle(splitPlanner, lastPosition); - verifyStatCount(result.split, 3); - lastPosition = result.lastPosition; - } - } - - @Test - public void testTableScanSingleStat() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .includeColumnStats(ImmutableSet.of("data")) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.splits()).hasSize(1); - IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); - assertThat(split.task().files()).hasSize(2); - verifyStatCount(split, 1); - - IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); - for (int i = 0; i < 3; ++i) { - CycleResult result = verifyOneCycle(splitPlanner, lastPosition); - verifyStatCount(result.split, 1); - lastPosition = result.lastPosition; - } - } - - private 
void verifyStatCount(IcebergSourceSplit split, int expected) { - if (expected == 0) { - split - .task() - .files() - .forEach( - f -> { - assertThat(f.file().valueCounts()).isNull(); - assertThat(f.file().columnSizes()).isNull(); - assertThat(f.file().lowerBounds()).isNull(); - assertThat(f.file().upperBounds()).isNull(); - assertThat(f.file().nanValueCounts()).isNull(); - assertThat(f.file().nullValueCounts()).isNull(); - }); - } else { - split - .task() - .files() - .forEach( - f -> { - assertThat(f.file().valueCounts()).hasSize(expected); - assertThat(f.file().columnSizes()).hasSize(expected); - assertThat(f.file().lowerBounds()).hasSize(expected); - assertThat(f.file().upperBounds()).hasSize(expected); - assertThat(f.file().nullValueCounts()).hasSize(expected); - // The nanValue is not counted for long and string fields - assertThat(f.file().nanValueCounts()).isEmpty(); - }); - } - } - - private void verifyMaxPlanningSnapshotCountResult( - ContinuousEnumerationResult result, - Snapshot fromSnapshotExclusive, - Snapshot toSnapshotInclusive, - Set expectedFiles) { - if (fromSnapshotExclusive == null) { - assertThat(result.fromPosition().snapshotId()).isNull(); - assertThat(result.fromPosition().snapshotTimestampMs()).isNull(); - } else { - assertThat(result.fromPosition().snapshotId().longValue()) - .isEqualTo(fromSnapshotExclusive.snapshotId()); - assertThat(result.fromPosition().snapshotTimestampMs().longValue()) - .isEqualTo(fromSnapshotExclusive.timestampMillis()); - } - assertThat(result.toPosition().snapshotId().longValue()) - .isEqualTo(toSnapshotInclusive.snapshotId()); - assertThat(result.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(toSnapshotInclusive.timestampMillis()); - // should only have one split with one data file, because split discover is limited to - // one snapshot and each snapshot has only one data file appended. - IcebergSourceSplit split = Iterables.getOnlyElement(result.splits()); - assertThat(split.task().files()).hasSize(1); - Set discoveredFiles = - split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().location()) - .collect(Collectors.toSet()); - assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); - } - - private Snapshot appendSnapshot(long seed, int numRecords) throws Exception { - List batch = RandomGenericData.generate(TestFixtures.SCHEMA, numRecords, seed); - DataFile dataFile = dataAppender.writeFile(null, batch); - dataAppender.appendToTable(dataFile); - return TABLE_RESOURCE.table().currentSnapshot(); - } - - private static class CycleResult { - IcebergEnumeratorPosition lastPosition; - IcebergSourceSplit split; - - CycleResult(IcebergEnumeratorPosition lastPosition, IcebergSourceSplit split) { - this.lastPosition = lastPosition; - this.split = split; - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java deleted file mode 100644 index 9b59e85d2afb..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.HadoopTableExtension; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -public class TestContinuousSplitPlannerImplStartStrategy { - private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; - - @TempDir protected Path temporaryFolder; - - @RegisterExtension - private static final HadoopTableExtension TABLE_RESOURCE = - new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); - - private GenericAppenderHelper dataAppender; - private Snapshot snapshot1; - private Snapshot snapshot2; - private Snapshot snapshot3; - - @BeforeEach - public void before() throws IOException { - dataAppender = new GenericAppenderHelper(TABLE_RESOURCE.table(), FILE_FORMAT, temporaryFolder); - } - - private void appendThreeSnapshots() throws IOException { - List batch1 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - dataAppender.appendToTable(batch1); - snapshot1 = TABLE_RESOURCE.table().currentSnapshot(); - - List batch2 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 1L); - dataAppender.appendToTable(batch2); - snapshot2 = TABLE_RESOURCE.table().currentSnapshot(); - - List batch3 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 2L); - dataAppender.appendToTable(batch3); - snapshot3 = TABLE_RESOURCE.table().currentSnapshot(); - } - - @Test - public void testTableScanThenIncrementalStrategy() throws IOException { - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - - assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) - .isNotPresent(); - - appendThreeSnapshots(); - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot3.snapshotId()); - } - - @ParameterizedTest - @EnumSource( - value = StreamingStartingStrategy.class, - names = {"INCREMENTAL_FROM_LATEST_SNAPSHOT", 
"INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE"}) - public void testForLatestSnapshotStrategyWithEmptyTable( - StreamingStartingStrategy startingStrategy) throws IOException { - ScanContext scanContext = - ScanContext.builder().streaming(true).startingStrategy(startingStrategy).build(); - - assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) - .isNotPresent(); - - appendThreeSnapshots(); - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot3.snapshotId()); - } - - @ParameterizedTest - @EnumSource( - value = StreamingStartingStrategy.class, - names = {"INCREMENTAL_FROM_LATEST_SNAPSHOT", "INCREMENTAL_FROM_LATEST_SNAPSHOT_EXCLUSIVE"}) - public void testForLatestSnapshotStrategyWithNonEmptyTable( - StreamingStartingStrategy startingStrategy) throws IOException { - appendThreeSnapshots(); - - ScanContext scanContext = - ScanContext.builder().streaming(true).startingStrategy(startingStrategy).build(); - - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot3.snapshotId()); - } - - @Test - public void testForEarliestSnapshotStrategy() throws IOException { - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - - assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) - .isNotPresent(); - - appendThreeSnapshots(); - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot1.snapshotId()); - } - - @Test - public void testForSpecificSnapshotIdStrategy() throws IOException { - ScanContext scanContextInvalidSnapshotId = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(1L) - .build(); - - assertThatThrownBy( - () -> - ContinuousSplitPlannerImpl.startSnapshot( - TABLE_RESOURCE.table(), scanContextInvalidSnapshotId)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Start snapshot id not found in history: 1"); - - appendThreeSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot2.snapshotId()) - .build(); - - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); - } - - @Test - public void testForSpecificSnapshotTimestampStrategySnapshot2() throws IOException { - ScanContext scanContextInvalidSnapshotTimestamp = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(1L) - .build(); - - assertThatThrownBy( - () -> - ContinuousSplitPlannerImpl.startSnapshot( - TABLE_RESOURCE.table(), scanContextInvalidSnapshotTimestamp)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith("Cannot find a snapshot after: "); - - appendThreeSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - 
.startSnapshotTimestamp(snapshot2.timestampMillis()) - .build(); - - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); - } - - @Test - public void testForSpecificSnapshotTimestampStrategySnapshot2Minus1() throws IOException { - appendThreeSnapshots(); - - ScanContext config = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot2.timestampMillis() - 1L) - .build(); - - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), config).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java deleted file mode 100644 index feefcb98646b..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.enumerator; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.junit.jupiter.api.Test; - -public class TestEnumerationHistory { - private static final int MAX_HISTORY_SIZE = 3; - private static final int FEW_PENDING_SPLITS = 2; - private static final int TOO_MANY_PENDING_SPLITS = 100; - - @Test - public void testEmptyHistory() { - EnumerationHistory history = new EnumerationHistory(MAX_HISTORY_SIZE); - int[] expectedHistorySnapshot = new int[0]; - testHistory(history, expectedHistorySnapshot); - } - - @Test - public void testNotFullHistory() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - int[] expectedHistorySnapshot = {1, 2}; - testHistory(history, expectedHistorySnapshot); - } - - @Test - public void testExactFullHistory() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - history.add(3); - int[] expectedHistorySnapshot = {1, 2, 3}; - testHistory(history, expectedHistorySnapshot); - } - - @Test - public void testOneMoreThanFullHistory() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - history.add(3); - history.add(4); - int[] expectedHistorySnapshot = {2, 3, 4}; - testHistory(history, expectedHistorySnapshot); - } - - @Test - public void testTwoMoreThanFullHistory() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - history.add(3); - history.add(4); - history.add(5); - int[] expectedHistorySnapshot = {3, 4, 5}; - testHistory(history, expectedHistorySnapshot); - } - - @Test - public void testThreeMoreThanFullHistory() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - history.add(3); - history.add(4); - history.add(5); - history.add(6); - int[] expectedHistorySnapshot = {4, 5, 6}; - testHistory(history, expectedHistorySnapshot); - } - - private void testHistory(EnumerationHistory history, int[] expectedHistorySnapshot) { - assertThat(history.shouldPauseSplitDiscovery(FEW_PENDING_SPLITS)).isFalse(); - if (history.hasFullHistory()) { - // throttle because pending split count is more than the sum of enumeration history - assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isTrue(); - } else { - // skipped throttling check because there is not enough history - assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isFalse(); - } - - int[] historySnapshot = history.snapshot(); - assertThat(historySnapshot).containsExactly(expectedHistorySnapshot); - - EnumerationHistory restoredHistory = new EnumerationHistory(MAX_HISTORY_SIZE); - restoredHistory.restore(historySnapshot); - - assertThat(history.shouldPauseSplitDiscovery(FEW_PENDING_SPLITS)).isFalse(); - if (history.hasFullHistory()) { - // throttle because pending split count is more than the sum of enumeration history - assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isTrue(); - } else { - // skipped throttling check because there is not enough history - assertThat(history.shouldPauseSplitDiscovery(30)).isFalse(); - } - } - - @Test - public void testRestoreDifferentSize() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - history.add(3); - int[] historySnapshot = history.snapshot(); - - EnumerationHistory smallerHistory = new EnumerationHistory(2); - smallerHistory.restore(historySnapshot); - int[] expectedRestoredHistorySnapshot = 
{2, 3}; - assertThat(smallerHistory.snapshot()).containsExactly(expectedRestoredHistorySnapshot); - - EnumerationHistory largerHisotry = new EnumerationHistory(4); - largerHisotry.restore(historySnapshot); - assertThat(largerHisotry.snapshot()).containsExactly(historySnapshot); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java deleted file mode 100644 index 2520a6b763e4..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Collection; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.flink.source.SplitHelpers; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergEnumeratorStateSerializer { - @TempDir protected Path temporaryFolder; - - private final IcebergEnumeratorStateSerializer serializer = - new IcebergEnumeratorStateSerializer(true); - - @Parameter(index = 0) - protected int version; - - @Parameters(name = "version={0}") - public static Object[][] parameters() { - return new Object[][] {new Object[] {1}, new Object[] {2}}; - } - - @TestTemplate - public void testEmptySnapshotIdAndPendingSplits() throws Exception { - IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(Collections.emptyList()); - testSerializer(enumeratorState); - } - - @TestTemplate - public void testSomeSnapshotIdAndEmptyPendingSplits() throws Exception { - IcebergEnumeratorPosition position = - IcebergEnumeratorPosition.of(1L, System.currentTimeMillis()); - - IcebergEnumeratorState enumeratorState = - new IcebergEnumeratorState(position, Collections.emptyList()); - testSerializer(enumeratorState); - } - - @TestTemplate - public void testSomeSnapshotIdAndPendingSplits() throws 
Exception { - IcebergEnumeratorPosition position = - IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 3, 1); - Collection pendingSplits = Lists.newArrayList(); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); - - IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(position, pendingSplits); - testSerializer(enumeratorState); - } - - @TestTemplate - public void testEnumerationSplitCountHistory() throws Exception { - if (version == 2) { - IcebergEnumeratorPosition position = - IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 3, 1); - Collection pendingSplits = Lists.newArrayList(); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); - int[] enumerationSplitCountHistory = {1, 2, 3}; - - IcebergEnumeratorState enumeratorState = - new IcebergEnumeratorState(position, pendingSplits, enumerationSplitCountHistory); - testSerializer(enumeratorState); - } - } - - private void testSerializer(IcebergEnumeratorState enumeratorState) throws IOException { - byte[] result; - if (version == 1) { - result = serializer.serializeV1(enumeratorState); - } else { - result = serializer.serialize(enumeratorState); - } - - IcebergEnumeratorState deserialized = serializer.deserialize(version, result); - assertEnumeratorStateEquals(enumeratorState, deserialized); - } - - private void assertEnumeratorStateEquals( - IcebergEnumeratorState expected, IcebergEnumeratorState actual) { - assertThat(actual.lastEnumeratedPosition()).isEqualTo(expected.lastEnumeratedPosition()); - - assertThat(actual.pendingSplits()).hasSameSizeAs(expected.pendingSplits()); - Iterator expectedIterator = expected.pendingSplits().iterator(); - Iterator actualIterator = actual.pendingSplits().iterator(); - for (int i = 0; i < expected.pendingSplits().size(); ++i) { - IcebergSourceSplitState expectedSplitState = expectedIterator.next(); - IcebergSourceSplitState actualSplitState = actualIterator.next(); - assertThat(actualSplitState.split().splitId()) - .isEqualTo(expectedSplitState.split().splitId()); - assertThat(actualSplitState.split().fileOffset()) - .isEqualTo(expectedSplitState.split().fileOffset()); - assertThat(actualSplitState.split().recordOffset()) - .isEqualTo(expectedSplitState.split().recordOffset()); - assertThat(actualSplitState.status()).isEqualTo(expectedSplitState.status()); - } - - assertThat(actual.enumerationSplitCountHistory()) - .containsExactly(expected.enumerationSplitCountHistory()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java deleted file mode 100644 index 0d1d0ce3217c..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java +++ /dev/null @@ -1,218 +0,0 @@ -/* - * 
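The enumeration split count history removed above (TestEnumerationHistory, and the version-2 case of the enumerator state serializer) behaves like a bounded circular window: only the last N counts are retained, snapshots are returned oldest-to-newest, and split discovery is only throttled once the window is full and the pending split count is more than the sum of the window. A minimal, self-contained sketch of that behaviour under those assumptions; BoundedHistorySketch and its method names are invented for illustration and are not the connector's API.

import java.util.Arrays;

// Illustrative stand-in for the bounded enumeration history exercised by the deleted tests.
final class BoundedHistorySketch {
  private final int[] window;
  private int count;

  BoundedHistorySketch(int maxSize) {
    this.window = new int[maxSize];
  }

  void add(int splitCount) {
    window[count % window.length] = splitCount; // overwrite the oldest entry
    count++;
  }

  boolean shouldPause(int pendingSplits) {
    if (count < window.length) {
      return false; // not enough history yet, mirroring the "hasFullHistory" guard
    }
    // throttle when pending splits exceed the sum of the recent enumeration counts
    return pendingSplits > Arrays.stream(window).sum();
  }

  int[] snapshot() {
    // Oldest-to-newest view, e.g. adding 1..4 with maxSize 3 yields [2, 3, 4]
    int[] result = new int[Math.min(count, window.length)];
    for (int i = 0; i < result.length; i++) {
      result[i] = window[(count - result.length + i) % window.length];
    }
    return result;
  }
}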
Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public abstract class ReaderFunctionTestBase { - - @Parameters(name = "fileFormat={0}") - public static Object[][] parameters() { - return new Object[][] { - new Object[] {FileFormat.AVRO}, - new Object[] {FileFormat.ORC}, - new Object[] {FileFormat.PARQUET} - }; - } - - @TempDir protected Path temporaryFolder; - - protected abstract ReaderFunction readerFunction(); - - protected abstract void assertRecords(List expected, List actual, Schema schema); - - @Parameter(index = 0) - private FileFormat fileFormat; - - private final GenericAppenderFactory appenderFactory = - new GenericAppenderFactory(TestFixtures.SCHEMA); - - private void assertRecordsAndPosition( - List expectedRecords, - int expectedFileOffset, - long startRecordOffset, - RecordsWithSplitIds> batch) { - batch.nextSplit(); - List actualRecords = Lists.newArrayList(); - long recordOffset = startRecordOffset; - RecordAndPosition recordAndPosition; - while ((recordAndPosition = batch.nextRecordFromSplit()) != null) { - actualRecords.add(recordAndPosition.record()); - assertThat(recordAndPosition.fileOffset()).isEqualTo(expectedFileOffset); - assertThat(recordAndPosition.recordOffset() - 1).isEqualTo(recordOffset); - recordOffset++; - } - - assertThat(actualRecords).hasSameSizeAs(expectedRecords); - assertRecords(expectedRecords, actualRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testNoCheckpointedPosition() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = 
IcebergSourceSplit.fromCombinedScanTask(combinedScanTask); - CloseableIterator>> reader = - readerFunction().apply(split); - - RecordsWithSplitIds> batch0 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); - batch0.recycle(); - - RecordsWithSplitIds> batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } - - @TestTemplate - public void testCheckpointedPositionBeforeFirstFile() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 0L); - CloseableIterator>> reader = - readerFunction().apply(split); - - RecordsWithSplitIds> batch0 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); - batch0.recycle(); - - RecordsWithSplitIds> batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } - - @TestTemplate - public void testCheckpointedPositionMiddleFirstFile() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 1L); - CloseableIterator>> reader = - readerFunction().apply(split); - - RecordsWithSplitIds> batch0 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(0).subList(1, 2), 0, 1L, batch0); - batch0.recycle(); - - RecordsWithSplitIds> batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } - - @TestTemplate - public void testCheckpointedPositionAfterFirstFile() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 2L); - CloseableIterator>> reader = - readerFunction().apply(split); - - RecordsWithSplitIds> batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } - - @TestTemplate - public void testCheckpointedPositionBeforeSecondFile() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 0L); - CloseableIterator>> reader = - 
readerFunction().apply(split); - - RecordsWithSplitIds> batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } - - @TestTemplate - public void testCheckpointedPositionMidSecondFile() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 1L); - CloseableIterator>> reader = - readerFunction().apply(split); - - RecordsWithSplitIds> batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1).subList(1, 2), 1, 1L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java deleted file mode 100644 index 0edf8ae009fe..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.util.Collections; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.BaseCombinedScanTask; -import org.apache.iceberg.BaseFileScanTask; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Files; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.PartitionSpecParser; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.encryption.PlaintextEncryptionManager; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.expressions.ResidualEvaluator; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -public class ReaderUtil { - - private ReaderUtil() {} - - public static FileScanTask createFileTask( - List records, - File file, - FileFormat fileFormat, - FileAppenderFactory appenderFactory) - throws IOException { - FileAppender appender = - appenderFactory.newAppender(Files.localOutput(file), fileFormat); - try { - appender.addAll(records); - } finally { - appender.close(); - } - - DataFile dataFile = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(file.length()) - .withPath(file.toString()) - .withFormat(fileFormat) - .withMetrics(appender.metrics()) - .build(); - - ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(Expressions.alwaysTrue()); - return new BaseFileScanTask( - dataFile, - null, - SchemaParser.toJson(TestFixtures.SCHEMA), - PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), - residuals); - } - - public static DataIterator createDataIterator(CombinedScanTask combinedTask) { - return new DataIterator<>( - new RowDataFileScanTaskReader( - TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, Collections.emptyList()), - combinedTask, - new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), - PlaintextEncryptionManager.instance()); - } - - public static List> createRecordBatchList( - Schema schema, int listSize, int batchCount) { - return createRecordBatchList(0L, schema, listSize, batchCount); - } - - public static List> createRecordBatchList( - long seed, Schema schema, int listSize, int batchCount) { - List records = RandomGenericData.generate(schema, listSize * batchCount, seed); - return Lists.partition(records, batchCount); - } - - public static CombinedScanTask createCombinedScanTask( - List> recordBatchList, - Path temporaryFolder, - FileFormat fileFormat, - GenericAppenderFactory appenderFactory) - throws IOException { - List fileTasks = Lists.newArrayListWithCapacity(recordBatchList.size()); - for (List recordBatch : recordBatchList) { - FileScanTask fileTask = - ReaderUtil.createFileTask( - recordBatch, - File.createTempFile("junit", null, temporaryFolder.toFile()), - 
fileFormat, - appenderFactory); - fileTasks.add(fileTask); - } - - return new BaseCombinedScanTask(fileTasks); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java deleted file mode 100644 index 6f09bd9a56d6..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.concurrent.atomic.AtomicBoolean; -import org.junit.jupiter.api.Test; - -public class TestArrayBatchRecords { - - @Test - public void testFullRange() { - String[] elements = new String[] {"0", "1", "2", "3"}; - testArray(elements, elements.length, 2, 119); - } - - @Test - public void testSubRange() { - String[] elements = new String[] {"0", "1", "2", "3"}; - testArray(elements, 2, 0, 0); - } - - private void testArray( - String[] elements, int numberOfRecords, int fileOffset, long startingRecordOffset) { - String splitId = "iceberg_split_1"; - AtomicBoolean recycled = new AtomicBoolean(); - - ArrayBatchRecords recordsWithSplitIds = - ArrayBatchRecords.forRecords( - splitId, - ignored -> recycled.set(true), - elements, - numberOfRecords, - fileOffset, - startingRecordOffset); - - assertThat(recordsWithSplitIds.nextSplit()).isEqualTo(splitId); - - for (int i = 0; i < numberOfRecords; i++) { - RecordAndPosition recAndPos = recordsWithSplitIds.nextRecordFromSplit(); - assertThat(recAndPos.record()).isEqualTo(elements[i]); - assertThat(recAndPos.fileOffset()).isEqualTo(fileOffset); - // recordOffset points to the position after this one - assertThat(recAndPos.recordOffset()).isEqualTo(startingRecordOffset + i + 1); - } - - assertThat(recordsWithSplitIds.nextRecordFromSplit()).isNull(); - assertThat(recordsWithSplitIds.nextSplit()).isNull(); - recordsWithSplitIds.recycle(); - assertThat(recycled.get()).isTrue(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java deleted file mode 100644 index 1a78bb1b0010..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.List; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.SourceReaderOptions; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.BaseCombinedScanTask; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.io.CloseableIterator; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestArrayPoolDataIteratorBatcherRowData { - - @TempDir protected Path temporaryFolder; - private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; - private final Configuration config = - new Configuration() - .set(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY, 1) - .set(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 2); - - private final GenericAppenderFactory appenderFactory = - new GenericAppenderFactory(TestFixtures.SCHEMA); - private final DataIteratorBatcher batcher = - new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory(TestFixtures.ROW_TYPE)); - - /** Read a CombinedScanTask that contains a single file with less than a full batch of records */ - @Test - public void testSingleFileLessThanOneFullBatch() throws Exception { - List records = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); - FileScanTask fileTask = - ReaderUtil.createFileTask( - records, - File.createTempFile("junit", null, temporaryFolder.toFile()), - FILE_FORMAT, - appenderFactory); - CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); - DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); - String splitId = "someSplitId"; - CloseableIterator>> recordBatchIterator = - batcher.batch(splitId, dataIterator); - - ArrayBatchRecords batch = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch.finishedSplits()).isEmpty(); - assertThat(batch.nextSplit()).isEqualTo(splitId); - assertThat(batch.records()).hasSize(2); - assertThat(batch.numberOfRecords()).isEqualTo(1); - - RecordAndPosition recordAndPosition = batch.nextRecordFromSplit(); - - /////////////////////////////// - // assert first record - - 
assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(1); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(0), recordAndPosition.record()); - - assertThat(batch.nextRecordFromSplit()).isNull(); - assertThat(batch.nextSplit()).isNull(); - batch.recycle(); - - assertThat(recordBatchIterator).isExhausted(); - } - - /** - * Read a CombinedScanTask that contains a single file with multiple batches. - * - *

    Insert 5 records in a single file that should result in 3 batches - */ - @Test - public void testSingleFileWithMultipleBatches() throws Exception { - List records = RandomGenericData.generate(TestFixtures.SCHEMA, 5, 1); - FileScanTask fileTask = - ReaderUtil.createFileTask( - records, - File.createTempFile("junit", null, temporaryFolder.toFile()), - FILE_FORMAT, - appenderFactory); - CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); - DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); - String splitId = "someSplitId"; - CloseableIterator>> recordBatchIterator = - batcher.batch(splitId, dataIterator); - - /////////////////////////////// - // assert first batch with full batch of 2 records - - ArrayBatchRecords batch0 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch0.finishedSplits()).isEmpty(); - assertThat(batch0.nextSplit()).isEqualTo(splitId); - assertThat(batch0.records()).hasSize(2); - assertThat(batch0.numberOfRecords()).isEqualTo(2); - - RecordAndPosition recordAndPosition; - - // assert first record - recordAndPosition = batch0.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(1); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(0), recordAndPosition.record()); - - // assert second record - recordAndPosition = batch0.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(2); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(1), recordAndPosition.record()); - - assertThat(batch0.nextRecordFromSplit()).isNull(); - assertThat(batch0.nextSplit()).isNull(); - batch0.recycle(); - - /////////////////////////////// - // assert second batch with full batch of 2 records - - ArrayBatchRecords batch1 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch1.records()).containsExactlyInAnyOrder(batch0.records()); - assertThat(batch1.finishedSplits()).isEmpty(); - assertThat(batch1.nextSplit()).isEqualTo(splitId); - assertThat(batch1.records()).hasSize(2); - assertThat(batch1.numberOfRecords()).isEqualTo(2); - - // assert third record - recordAndPosition = batch1.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(3); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(2), recordAndPosition.record()); - - // assert fourth record - recordAndPosition = batch1.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(4); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(3), recordAndPosition.record()); - - assertThat(batch1.nextRecordFromSplit()).isNull(); - assertThat(batch1.nextSplit()).isNull(); - batch1.recycle(); - - /////////////////////////////// - // assert third batch with partial batch of 1 record - - ArrayBatchRecords batch2 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch2.records()).containsExactlyInAnyOrder(batch0.records()); - 
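The batch shapes asserted in this test follow directly from the configured fetch batch record count of 2: five records yield batches of 2, 2 and 1, and a single record still gets an array sized to the batch capacity while numberOfRecords() reports the actual count. A rough sketch of just that partitioning arithmetic, in plain Java rather than the connector's batcher; batchSizes is an invented helper.

import java.util.ArrayList;
import java.util.List;

final class BatchingSketch {
  // Split `total` records into batches of at most `batchSize` records,
  // e.g. total = 5, batchSize = 2 -> sizes [2, 2, 1], as asserted above.
  static List<Integer> batchSizes(int total, int batchSize) {
    List<Integer> sizes = new ArrayList<>();
    for (int remaining = total; remaining > 0; remaining -= batchSize) {
      sizes.add(Math.min(batchSize, remaining));
    }
    return sizes;
  }

  public static void main(String[] args) {
    System.out.println(batchSizes(5, 2)); // [2, 2, 1]
    System.out.println(batchSizes(1, 2)); // [1]
  }
}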
assertThat(batch2.finishedSplits()).isEmpty(); - assertThat(batch2.nextSplit()).isEqualTo(splitId); - assertThat(batch2.records()).hasSize(2); - assertThat(batch2.numberOfRecords()).isEqualTo(1); - - // assert fifth record - recordAndPosition = batch2.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(5); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(4), recordAndPosition.record()); - - assertThat(batch2.nextRecordFromSplit()).isNull(); - assertThat(batch2.nextSplit()).isNull(); - batch2.recycle(); - - assertThat(recordBatchIterator).isExhausted(); - } - - /** - * Read a CombinedScanTask that contains with multiple files. - * - *

    In this test, we also seek the iterator to starting position (1, 1). - */ - @Test - public void testMultipleFilesWithSeekPosition() throws Exception { - List records0 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); - FileScanTask fileTask0 = - ReaderUtil.createFileTask( - records0, - File.createTempFile("junit", null, temporaryFolder.toFile()), - FILE_FORMAT, - appenderFactory); - List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 4, 2); - FileScanTask fileTask1 = - ReaderUtil.createFileTask( - records1, - File.createTempFile("junit", null, temporaryFolder.toFile()), - FILE_FORMAT, - appenderFactory); - List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 3, 3); - FileScanTask fileTask2 = - ReaderUtil.createFileTask( - records2, - File.createTempFile("junit", null, temporaryFolder.toFile()), - FILE_FORMAT, - appenderFactory); - CombinedScanTask combinedTask = - new BaseCombinedScanTask(Arrays.asList(fileTask0, fileTask1, fileTask2)); - - DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); - dataIterator.seek(1, 1); - - String splitId = "someSplitId"; - CloseableIterator>> recordBatchIterator = - batcher.batch(splitId, dataIterator); - - /////////////////////////////// - // file0 is skipped by seek - - /////////////////////////////// - // file1 has 4 records. because the seek position, first record is skipped. - // we should read 3 remaining records in 2 batches: - // batch10 with 2 records and batch11 with 1 records. - - // assert first batch from file1 with full batch of 2 records - - // variable naming convention: batch - ArrayBatchRecords batch10 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch10.finishedSplits()).isEmpty(); - assertThat(batch10.nextSplit()).isEqualTo(splitId); - assertThat(batch10.records()).hasSize(2); - assertThat(batch10.numberOfRecords()).isEqualTo(2); - - RecordAndPosition recordAndPosition; - - recordAndPosition = batch10.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(1); - assertThat(recordAndPosition.recordOffset()) - .as("seek should skip the first record in file1. 
starting from the second record") - .isEqualTo(2); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(1), recordAndPosition.record()); - - recordAndPosition = batch10.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(1); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(3); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(2), recordAndPosition.record()); - - assertThat(batch10.nextRecordFromSplit()).isNull(); - assertThat(batch10.nextSplit()).isNull(); - batch10.recycle(); - - // assert second batch from file1 with partial batch of 1 record - - // variable naming convention: batch__ - ArrayBatchRecords batch11 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch11.records()).containsExactlyInAnyOrder(batch10.records()); - assertThat(batch11.finishedSplits()).isEmpty(); - assertThat(batch11.nextSplit()).isEqualTo(splitId); - assertThat(batch11.records()).hasSize(2); - assertThat(batch11.numberOfRecords()).isEqualTo(1); - - recordAndPosition = batch11.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(1); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(4); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(3), recordAndPosition.record()); - - assertThat(batch11.nextRecordFromSplit()).isNull(); - assertThat(batch11.nextSplit()).isNull(); - batch11.recycle(); - - /////////////////////////////// - // file2 has 3 records. - // we should read 3 records in 2 batches: - // batch20 with 2 records and batch21 with 1 records - - // assert first batch from file2 with full batch of 2 records - - // variable naming convention: batch__ - ArrayBatchRecords batch20 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch20.records()).containsExactlyInAnyOrder(batch10.records()); - assertThat(batch20.finishedSplits()).isEmpty(); - assertThat(batch20.nextSplit()).isEqualTo(splitId); - assertThat(batch20.records()).hasSize(2); - assertThat(batch20.numberOfRecords()).isEqualTo(2); - - recordAndPosition = batch20.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(2); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(1); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(0), recordAndPosition.record()); - - recordAndPosition = batch20.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(2); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(2); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(1), recordAndPosition.record()); - - assertThat(batch20.nextRecordFromSplit()).isNull(); - assertThat(batch20.nextSplit()).isNull(); - batch20.recycle(); - - /////////////////////////////// - // assert second batch from file2 with partial batch of 1 record - - // variable naming convention: batch__ - ArrayBatchRecords batch21 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch21.records()).containsExactlyInAnyOrder(batch10.records()); - assertThat(batch21.finishedSplits()).isEmpty(); - assertThat(batch21.nextSplit()).isEqualTo(splitId); - assertThat(batch21.records()).hasSize(2); - 
assertThat(batch21.numberOfRecords()).isEqualTo(1); - - recordAndPosition = batch21.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(2); - - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(3); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(2), recordAndPosition.record()); - - assertThat(batch21.nextRecordFromSplit()).isNull(); - assertThat(batch21.nextSplit()).isNull(); - batch21.recycle(); - - assertThat(recordBatchIterator).isExhausted(); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java deleted file mode 100644 index af806d4c655d..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
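The seek expectations above (dataIterator.seek(1, 1) skips file0 entirely plus the first record of file1, and the earlier checkpointed-position tests behave the same way) reduce to: drop every file before fileOffset, drop recordOffset records inside that file, and read everything that remains. A small model of that skip logic under those assumptions; seekView is a hypothetical helper, not part of DataIterator.

import java.util.ArrayList;
import java.util.List;

final class SeekSketch {
  // Records still visible after seeking to (fileOffset, recordOffset).
  // With file sizes [1, 4, 3] and seek(1, 1), this leaves 3 records of file1
  // plus all 3 records of file2, matching the batches asserted above.
  static <T> List<T> seekView(List<List<T>> files, int fileOffset, long recordOffset) {
    List<T> remaining = new ArrayList<>();
    for (int f = fileOffset; f < files.size(); f++) {
      List<T> file = files.get(f);
      int start = (f == fileOffset) ? (int) recordOffset : 0;
      remaining.addAll(file.subList(start, file.size()));
    }
    return remaining;
  }
}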
- */ -package org.apache.iceberg.flink.source.reader; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.time.LocalDateTime; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.HadoopTableExtension; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestColumnStatsWatermarkExtractor { - public static final Schema SCHEMA = - new Schema( - required(1, "timestamp_column", Types.TimestampType.withoutZone()), - required(2, "timestamptz_column", Types.TimestampType.withZone()), - required(3, "long_column", Types.LongType.get()), - required(4, "string_column", Types.StringType.get())); - - private static final GenericAppenderFactory APPENDER_FACTORY = new GenericAppenderFactory(SCHEMA); - - private static final List> TEST_RECORDS = - ImmutableList.of( - RandomGenericData.generate(SCHEMA, 3, 2L), RandomGenericData.generate(SCHEMA, 3, 19L)); - - private static final List> MIN_VALUES = - ImmutableList.of(Maps.newHashMapWithExpectedSize(3), Maps.newHashMapWithExpectedSize(3)); - - @TempDir protected Path temporaryFolder; - - @RegisterExtension - private static final HadoopTableExtension SOURCE_TABLE_EXTENSION = - new HadoopTableExtension(DATABASE, TestFixtures.TABLE, SCHEMA); - - @Parameter(index = 0) - private String columnName; - - @BeforeAll - public static void updateMinValue() { - for (int i = 0; i < TEST_RECORDS.size(); ++i) { - for (Record r : TEST_RECORDS.get(i)) { - Map minValues = MIN_VALUES.get(i); - - LocalDateTime localDateTime = (LocalDateTime) r.get(0); - minValues.merge( - "timestamp_column", localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli(), Math::min); - - OffsetDateTime offsetDateTime = (OffsetDateTime) r.get(1); - minValues.merge("timestamptz_column", offsetDateTime.toInstant().toEpochMilli(), Math::min); - - minValues.merge("long_column", (Long) r.get(2), Math::min); - } - } - } - - @Parameters(name = "columnName = {0}") - public static Collection data() { - return ImmutableList.of( - new Object[] {"timestamp_column"}, - new Object[] {"timestamptz_column"}, - new Object[] {"long_column"}); - } - - @TestTemplate - public void testSingle() throws IOException { - ColumnStatsWatermarkExtractor 
extractor = - new ColumnStatsWatermarkExtractor(SCHEMA, columnName, TimeUnit.MILLISECONDS); - - assertThat(extractor.extractWatermark(split(0))) - .isEqualTo(MIN_VALUES.get(0).get(columnName).longValue()); - } - - @TestTemplate - public void testTimeUnit() throws IOException { - assumeThat(columnName).isEqualTo("long_column"); - ColumnStatsWatermarkExtractor extractor = - new ColumnStatsWatermarkExtractor(SCHEMA, columnName, TimeUnit.MICROSECONDS); - - assertThat(extractor.extractWatermark(split(0))) - .isEqualTo(MIN_VALUES.get(0).get(columnName) / 1000L); - } - - @TestTemplate - public void testMultipleFiles() throws IOException { - assumeThat(columnName).isEqualTo("timestamp_column"); - IcebergSourceSplit combinedSplit = - IcebergSourceSplit.fromCombinedScanTask( - ReaderUtil.createCombinedScanTask( - TEST_RECORDS, temporaryFolder, FileFormat.PARQUET, APPENDER_FACTORY)); - - ColumnStatsWatermarkExtractor extractor = - new ColumnStatsWatermarkExtractor(SCHEMA, columnName, null); - - assertThat(extractor.extractWatermark(split(0))) - .isEqualTo(MIN_VALUES.get(0).get(columnName).longValue()); - assertThat(extractor.extractWatermark(split(1))) - .isEqualTo(MIN_VALUES.get(1).get(columnName).longValue()); - assertThat(extractor.extractWatermark(combinedSplit)) - .isEqualTo(Math.min(MIN_VALUES.get(0).get(columnName), MIN_VALUES.get(1).get(columnName))); - } - - @TestTemplate - public void testWrongColumn() { - assumeThat(columnName).isEqualTo("string_column"); - assertThatThrownBy(() -> new ColumnStatsWatermarkExtractor(SCHEMA, columnName, null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining( - "Found STRING, expected a LONG or TIMESTAMP column for watermark generation."); - } - - @TestTemplate - public void testEmptyStatistics() throws IOException { - assumeThat(columnName).isEqualTo("timestamp_column"); - - // Create an extractor for a column we do not have statistics - ColumnStatsWatermarkExtractor extractor = - new ColumnStatsWatermarkExtractor(10, "missing_field"); - assertThatThrownBy(() -> extractor.extractWatermark(split(0))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Missing statistics for column"); - } - - private IcebergSourceSplit split(int id) throws IOException { - return IcebergSourceSplit.fromCombinedScanTask( - ReaderUtil.createCombinedScanTask( - ImmutableList.of(TEST_RECORDS.get(id)), - temporaryFolder, - FileFormat.PARQUET, - APPENDER_FACTORY)); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java deleted file mode 100644 index 8d6782586676..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
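The removed TestColumnStatsWatermarkExtractor cases pin down the extractor's observable contract: the watermark for a split is the minimum of the tracked column's lower bounds across its files, normalised to milliseconds, with MICROSECONDS values divided by 1000 and non-LONG/TIMESTAMP or missing columns rejected. A compact sketch of only the reduction step, assuming the per-file minimums have already been read from column statistics; watermarkMillis and columnMinPerFile are illustrative names, not Iceberg APIs.

import java.util.List;
import java.util.concurrent.TimeUnit;

final class WatermarkSketch {
  // Combine per-file lower bounds into a split-level watermark in epoch millis.
  static long watermarkMillis(List<Long> columnMinPerFile, TimeUnit unit) {
    long min = Long.MAX_VALUE; // assumes at least one file contributes a bound
    for (long value : columnMinPerFile) {
      min = Math.min(min, value);
    }
    // Mirrors the test's expectation: MICROSECONDS values are reduced to millis via / 1000.
    return unit == TimeUnit.MICROSECONDS ? min / 1000L : min;
  }
}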
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.connector.testutils.source.reader.TestingReaderContext; -import org.apache.flink.connector.testutils.source.reader.TestingReaderOutput; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.encryption.PlaintextEncryptionManager; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.SerializableComparator; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestIcebergSourceReader { - @TempDir protected Path temporaryFolder; - - private final GenericAppenderFactory appenderFactory = - new GenericAppenderFactory(TestFixtures.SCHEMA); - - @Test - public void testReaderMetrics() throws Exception { - TestingReaderOutput readerOutput = new TestingReaderOutput<>(); - TestingMetricGroup metricGroup = new TestingMetricGroup(); - TestingReaderContext readerContext = new TestingReaderContext(new Configuration(), metricGroup); - IcebergSourceReader reader = createReader(metricGroup, readerContext, null); - reader.start(); - - testOneSplitFetcher(reader, readerOutput, metricGroup, 1); - testOneSplitFetcher(reader, readerOutput, metricGroup, 2); - } - - @Test - public void testReaderOrder() throws Exception { - // Create 2 splits - List> recordBatchList1 = - ReaderUtil.createRecordBatchList(0L, TestFixtures.SCHEMA, 1, 1); - CombinedScanTask task1 = - ReaderUtil.createCombinedScanTask( - recordBatchList1, temporaryFolder, FileFormat.PARQUET, appenderFactory); - - List> recordBatchList2 = - ReaderUtil.createRecordBatchList(1L, TestFixtures.SCHEMA, 1, 1); - CombinedScanTask task2 = - ReaderUtil.createCombinedScanTask( - recordBatchList2, temporaryFolder, FileFormat.PARQUET, appenderFactory); - - // Sort the splits in one way - List rowDataList1 = - read( - Arrays.asList( - IcebergSourceSplit.fromCombinedScanTask(task1), - IcebergSourceSplit.fromCombinedScanTask(task2)), - 2); - - // Reverse the splits - List rowDataList2 = - read( - Arrays.asList( - IcebergSourceSplit.fromCombinedScanTask(task2), - IcebergSourceSplit.fromCombinedScanTask(task1)), - 2); - - // Check that the order of the elements is not changed - assertThat(rowDataList1).containsExactlyElementsOf(rowDataList2); - } - - private List read(List splits, long expected) throws Exception { - TestingMetricGroup metricGroup = new TestingMetricGroup(); - TestingReaderContext readerContext = 
new TestingReaderContext(new Configuration(), metricGroup); - // Using IdBasedComparator, so we can have a deterministic order of the splits - IcebergSourceReader reader = createReader(metricGroup, readerContext, new IdBasedComparator()); - reader.start(); - - reader.addSplits(splits); - TestingReaderOutput readerOutput = new TestingReaderOutput<>(); - while (readerOutput.getEmittedRecords().size() < expected) { - reader.pollNext(readerOutput); - } - - reader.pollNext(readerOutput); - - assertThat(readerOutput.getEmittedRecords()).hasSize((int) expected); - return readerOutput.getEmittedRecords(); - } - - private void testOneSplitFetcher( - IcebergSourceReader reader, - TestingReaderOutput readerOutput, - TestingMetricGroup metricGroup, - int expectedCount) - throws Exception { - long seed = expectedCount; - // Each split should contain only one file with one record - List> recordBatchList = - ReaderUtil.createRecordBatchList(seed, TestFixtures.SCHEMA, 1, 1); - CombinedScanTask task = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, FileFormat.PARQUET, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(task); - reader.addSplits(Collections.singletonList(split)); - - while (readerOutput.getEmittedRecords().size() < expectedCount) { - reader.pollNext(readerOutput); - } - - assertThat(readerOutput.getEmittedRecords()).hasSize(expectedCount); - TestHelpers.assertRowData( - TestFixtures.SCHEMA, - recordBatchList.get(0).get(0), - readerOutput.getEmittedRecords().get(expectedCount - 1)); - assertThat(metricGroup.counters().get("assignedSplits").getCount()).isEqualTo(expectedCount); - - // One more poll will get null record batch. - // That will finish the split and cause split fetcher to be closed due to idleness. - // Then next split will create a new split reader. - reader.pollNext(readerOutput); - } - - private IcebergSourceReader createReader( - MetricGroup metricGroup, - SourceReaderContext readerContext, - SerializableComparator splitComparator) { - IcebergSourceReaderMetrics readerMetrics = - new IcebergSourceReaderMetrics(metricGroup, "db.tbl"); - RowDataReaderFunction readerFunction = - new RowDataReaderFunction( - new Configuration(), - TestFixtures.SCHEMA, - TestFixtures.SCHEMA, - null, - true, - new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), - PlaintextEncryptionManager.instance(), - Collections.emptyList()); - return new IcebergSourceReader<>( - SerializableRecordEmitter.defaultEmitter(), - readerMetrics, - readerFunction, - splitComparator, - readerContext); - } - - private static class IdBasedComparator implements SerializableComparator { - @Override - public int compare(IcebergSourceSplit o1, IcebergSourceSplit o2) { - return o1.splitId().compareTo(o2.splitId()); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java deleted file mode 100644 index 36749d3ec2dc..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.file.Path; -import java.util.Collections; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.encryption.PlaintextEncryptionManager; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.io.TempDir; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -public class TestLimitableDataIterator { - @TempDir private static Path temporaryFolder; - - private final RowDataFileScanTaskReader reader = - new RowDataFileScanTaskReader( - TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, Collections.emptyList()); - private final HadoopFileIO fileIO = new HadoopFileIO(new org.apache.hadoop.conf.Configuration()); - private final EncryptionManager encryptionManager = PlaintextEncryptionManager.instance(); - - private static CombinedScanTask combinedScanTask; - private static int totalRecords; - - @BeforeAll - public static void beforeClass() throws Exception { - GenericAppenderFactory appenderFactory = new GenericAppenderFactory(TestFixtures.SCHEMA); - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, FileFormat.PARQUET, appenderFactory); - totalRecords = 3 * 2; - } - - @ParameterizedTest - @ValueSource(longs = {-1L, 0L, 1L, 6L, 7L}) - public void testUnlimited(long limit) { - LimitableDataIterator dataIterator = - new LimitableDataIterator<>( - reader, combinedScanTask, fileIO, encryptionManager, RecordLimiter.create(limit)); - - List result = Lists.newArrayList(); - while (dataIterator.hasNext()) { - result.add(dataIterator.next()); - } - - if (limit <= 0 || limit > totalRecords) { - // read all records - assertThat(result).hasSize(totalRecords); - } else { - assertThat(result).hasSize((int) limit); - } - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java deleted file mode 100644 index 55f9c0af3a29..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - 
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.source.reader;
-
-import java.util.Collections;
-import java.util.List;
-import java.util.stream.Collectors;
-import org.apache.flink.configuration.Configuration;
-import org.apache.flink.table.data.RowData;
-import org.apache.flink.table.data.conversion.DataStructureConverter;
-import org.apache.flink.table.data.conversion.DataStructureConverters;
-import org.apache.flink.table.types.logical.RowType;
-import org.apache.flink.table.types.utils.TypeConversions;
-import org.apache.flink.types.Row;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.data.Record;
-import org.apache.iceberg.encryption.PlaintextEncryptionManager;
-import org.apache.iceberg.flink.FlinkSchemaUtil;
-import org.apache.iceberg.flink.TestFixtures;
-import org.apache.iceberg.flink.TestHelpers;
-import org.apache.iceberg.hadoop.HadoopFileIO;
-
-public class TestRowDataReaderFunction extends ReaderFunctionTestBase<RowData> {
-
-  protected static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA);
-  private static final DataStructureConverter<Object, Object> ROW_DATA_CONVERTER =
-      DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(ROW_TYPE));
-
-  @Override
-  protected ReaderFunction<RowData> readerFunction() {
-    return new RowDataReaderFunction(
-        new Configuration(),
-        TestFixtures.SCHEMA,
-        TestFixtures.SCHEMA,
-        null,
-        true,
-        new HadoopFileIO(new org.apache.hadoop.conf.Configuration()),
-        PlaintextEncryptionManager.instance(),
-        Collections.emptyList());
-  }
-
-  @Override
-  protected void assertRecords(List<Record> expected, List<RowData> actual, Schema schema) {
-    List<Row> rows = toRows(actual);
-    TestHelpers.assertRecords(rows, expected, TestFixtures.SCHEMA);
-  }
-
-  private List<Row> toRows(List<RowData> actual) {
-    return actual.stream()
-        .map(rowData -> (Row) ROW_DATA_CONVERTER.toExternal(rowData))
-        .collect(Collectors.toList());
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java
deleted file mode 100644
index 290628c5fc90..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.source.reader;
-
-import java.util.Map;
-import org.apache.flink.metrics.Counter;
-import org.apache.flink.metrics.Gauge;
-import org.apache.flink.metrics.MetricGroup;
-import org.apache.flink.metrics.SimpleCounter;
-import org.apache.flink.metrics.groups.OperatorIOMetricGroup;
-import org.apache.flink.metrics.groups.SourceReaderMetricGroup;
-import org.apache.flink.metrics.groups.UnregisteredMetricsGroup;
-import org.apache.iceberg.relocated.com.google.common.collect.Maps;
-
-class TestingMetricGroup extends UnregisteredMetricsGroup implements SourceReaderMetricGroup {
-  private final Map<String, Counter> counters;
-
-  TestingMetricGroup() {
-    this.counters = Maps.newHashMap();
-  }
-
-  /** Pass along the reference to share the map for child metric groups. */
-  private TestingMetricGroup(Map<String, Counter> counters) {
-    this.counters = counters;
-  }
-
-  Map<String, Counter> counters() {
-    return counters;
-  }
-
-  @Override
-  public Counter counter(String name) {
-    Counter counter = new SimpleCounter();
-    counters.put(name, counter);
-    return counter;
-  }
-
-  @Override
-  public MetricGroup addGroup(String name) {
-    return new TestingMetricGroup(counters);
-  }
-
-  @Override
-  public MetricGroup addGroup(String key, String value) {
-    return new TestingMetricGroup(counters);
-  }
-
-  @Override
-  public OperatorIOMetricGroup getIOMetricGroup() {
-    return new TestingOperatorIOMetricGroup();
-  }
-
-  @Override
-  public Counter getNumRecordsInErrorsCounter() {
-    return new SimpleCounter();
-  }
-
-  @Override
-  public void setPendingBytesGauge(Gauge<Long> pendingBytesGauge) {}
-
-  @Override
-  public void setPendingRecordsGauge(Gauge<Long> pendingRecordsGauge) {}
-
-  private static class TestingOperatorIOMetricGroup extends UnregisteredMetricsGroup
-      implements OperatorIOMetricGroup {
-    @Override
-    public Counter getNumRecordsInCounter() {
-      return new SimpleCounter();
-    }
-
-    @Override
-    public Counter getNumRecordsOutCounter() {
-      return new SimpleCounter();
-    }
-
-    @Override
-    public Counter getNumBytesInCounter() {
-      return new SimpleCounter();
-    }
-
-    @Override
-    public Counter getNumBytesOutCounter() {
-      return new SimpleCounter();
-    }
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java
deleted file mode 100644
index 4a21f451e1e5..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.source.split;
-
-import static org.assertj.core.api.Assertions.assertThat;
-
-import java.nio.file.Path;
-import java.util.List;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.stream.Collectors;
-import org.apache.iceberg.FileScanTask;
-import org.apache.iceberg.flink.source.SplitHelpers;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.io.TempDir;
-
-public class TestIcebergSourceSplitSerializer {
-
-  @TempDir protected Path temporaryFolder;
-
-  private final IcebergSourceSplitSerializer serializer = new IcebergSourceSplitSerializer(true);
-
-  @Test
-  public void testLatestVersion() throws Exception {
-    serializeAndDeserialize(1, 1);
-    serializeAndDeserialize(10, 2);
-  }
-
-  private void serializeAndDeserialize(int splitCount, int filesPerSplit) throws Exception {
-    final List<IcebergSourceSplit> splits =
-        SplitHelpers.createSplitsFromTransientHadoopTable(
-            temporaryFolder, splitCount, filesPerSplit);
-    for (IcebergSourceSplit split : splits) {
-      byte[] result = serializer.serialize(split);
-      IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result);
-      assertSplitEquals(split, deserialized);
-
-      byte[] cachedResult = serializer.serialize(split);
-      assertThat(cachedResult).isSameAs(result);
-      IcebergSourceSplit deserialized2 =
-          serializer.deserialize(serializer.getVersion(), cachedResult);
-      assertSplitEquals(split, deserialized2);
-
-      split.updatePosition(0, 100);
-      byte[] resultAfterUpdatePosition = serializer.serialize(split);
-      // after position change, serialized bytes should have changed
-      assertThat(resultAfterUpdatePosition).isNotSameAs(cachedResult);
-      IcebergSourceSplit deserialized3 =
-          serializer.deserialize(serializer.getVersion(), resultAfterUpdatePosition);
-      assertSplitEquals(split, deserialized3);
-    }
-  }
-
-  @Test
-  public void testV1() throws Exception {
-    serializeAndDeserializeV1(1, 1);
-    serializeAndDeserializeV1(10, 2);
-  }
-
-  private void serializeAndDeserializeV1(int splitCount, int filesPerSplit) throws Exception {
-    final List<IcebergSourceSplit> splits =
-        SplitHelpers.createSplitsFromTransientHadoopTable(
-            temporaryFolder, splitCount, filesPerSplit);
-    for (IcebergSourceSplit split : splits) {
-      byte[] result = split.serializeV1();
-      IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV1(result);
-      assertSplitEquals(split, deserialized);
-    }
-  }
-
-  @Test
-  public void testV2() throws Exception {
-    serializeAndDeserializeV2(1, 1);
-    serializeAndDeserializeV2(10, 2);
-  }
-
-  private void serializeAndDeserializeV2(int splitCount, int filesPerSplit) throws Exception {
-    final List<IcebergSourceSplit> splits =
-        SplitHelpers.createSplitsFromTransientHadoopTable(
-            temporaryFolder, splitCount, filesPerSplit);
-    for (IcebergSourceSplit split : splits) {
-      byte[] result = split.serializeV2();
-      IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV2(result, true);
-      assertSplitEquals(split, deserialized);
-    }
-  }
-
-  @Test
-  public void testV3WithTooManyDeleteFiles() throws Exception {
-    serializeAndDeserializeV3(1, 1, 5000);
-  }
-
-  private void serializeAndDeserializeV3(int splitCount, int filesPerSplit, int mockDeletesPerSplit)
-      throws Exception {
-    final List<IcebergSourceSplit> splits =
-        SplitHelpers.createSplitsFromTransientHadoopTable(
-            temporaryFolder, splitCount, filesPerSplit);
-    final List<IcebergSourceSplit> splitsWithMockDeleteFiles =
-        SplitHelpers.equipSplitsWithMockDeleteFiles(splits, temporaryFolder, mockDeletesPerSplit);
-
-    for (IcebergSourceSplit split : splitsWithMockDeleteFiles) {
-      byte[] result = split.serializeV3();
-      IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV3(result, true);
-      assertSplitEquals(split, deserialized);
-    }
-  }
-
-  @Test
-  public void testDeserializeV1() throws Exception {
-    final List<IcebergSourceSplit> splits =
-        SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1);
-    for (IcebergSourceSplit split : splits) {
-      byte[] result = split.serializeV1();
-      IcebergSourceSplit deserialized = serializer.deserialize(1, result);
-      assertSplitEquals(split, deserialized);
-    }
-  }
-
-  @Test
-  public void testCheckpointedPosition() throws Exception {
-    final AtomicInteger index = new AtomicInteger();
-    final List<IcebergSourceSplit> splits =
-        SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 10, 2).stream()
-            .map(
-                split -> {
-                  IcebergSourceSplit result;
-                  if (index.get() % 2 == 0) {
-                    result = IcebergSourceSplit.fromCombinedScanTask(split.task(), 1, 1);
-                  } else {
-                    result = split;
-                  }
-                  index.incrementAndGet();
-                  return result;
-                })
-            .collect(Collectors.toList());
-
-    for (IcebergSourceSplit split : splits) {
-      byte[] result = serializer.serialize(split);
-      IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result);
-      assertSplitEquals(split, deserialized);
-
-      byte[] cachedResult = serializer.serialize(split);
-      assertThat(cachedResult).isSameAs(result);
-      IcebergSourceSplit deserialized2 =
-          serializer.deserialize(serializer.getVersion(), cachedResult);
-      assertSplitEquals(split, deserialized2);
-    }
-  }
-
-  private void assertSplitEquals(IcebergSourceSplit expected, IcebergSourceSplit actual) {
-    List<FileScanTask> expectedTasks = Lists.newArrayList(expected.task().tasks().iterator());
-    List<FileScanTask> actualTasks = Lists.newArrayList(actual.task().tasks().iterator());
-    assertThat(actualTasks).hasSameSizeAs(expectedTasks);
-    for (int i = 0; i < expectedTasks.size(); ++i) {
-      FileScanTask expectedTask = expectedTasks.get(i);
-      FileScanTask actualTask = actualTasks.get(i);
-      assertThat(actualTask.file().location()).isEqualTo(expectedTask.file().location());
-      assertThat(actualTask.sizeBytes()).isEqualTo(expectedTask.sizeBytes());
-      assertThat(actualTask.filesCount()).isEqualTo(expectedTask.filesCount());
-      assertThat(actualTask.start()).isEqualTo(expectedTask.start());
-      assertThat(actualTask.length()).isEqualTo(expectedTask.length());
-    }
-
-    assertThat(actual.fileOffset()).isEqualTo(expected.fileOffset());
-    assertThat(actual.recordOffset()).isEqualTo(expected.recordOffset());
-  }
-}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java
deleted file mode 100644
index 11a563709f1a..000000000000
--- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.util;
-
-import static org.assertj.core.api.Assertions.assertThat;
-
-import org.junit.jupiter.api.Test;
-import org.mockito.MockedStatic;
-import org.mockito.Mockito;
-
-public class TestFlinkPackage {
-
-  /** This unit test would need to be adjusted as new Flink version is supported. */
-  @Test
-  public void testVersion() {
-    assertThat(FlinkPackage.version()).isEqualTo("1.19.2");
-  }
-
-  @Test
-  public void testDefaultVersion() {
-    // It's difficult to reproduce a reflection error in a unit test, so we just inject a mocked
-    // fault to test the default logic
-
-    // First make sure we're not caching a version result from a previous test
-    FlinkPackage.setVersion(null);
-    try (MockedStatic<FlinkPackage> mockedStatic = Mockito.mockStatic(FlinkPackage.class)) {
-      mockedStatic.when(FlinkPackage::versionFromJar).thenThrow(RuntimeException.class);
-      mockedStatic.when(FlinkPackage::version).thenCallRealMethod();
-      assertThat(FlinkPackage.version()).isEqualTo(FlinkPackage.FLINK_UNKNOWN_VERSION);
-    }
-    FlinkPackage.setVersion(null);
-    try (MockedStatic<FlinkPackage> mockedStatic = Mockito.mockStatic(FlinkPackage.class)) {
-      mockedStatic.when(FlinkPackage::versionFromJar).thenReturn(null);
-      mockedStatic.when(FlinkPackage::version).thenCallRealMethod();
-      FlinkPackage.setVersion(null);
-      assertThat(FlinkPackage.version()).isEqualTo(FlinkPackage.FLINK_UNKNOWN_VERSION);
-    }
-  }
-}
diff --git a/flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory b/flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory
deleted file mode 100644
index 952255a52b7c..000000000000
--- a/flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.metrics.reporter.MetricReporterFactory
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-org.apache.iceberg.flink.maintenance.operator.MetricsReporterFactoryForTests
diff --git a/flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory
deleted file mode 100644
index 47a3c94aa991..000000000000
--- a/flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-org.apache.iceberg.flink.source.BoundedTableFactory
diff --git a/gradle.properties b/gradle.properties
index 80a00f8cedef..0f70b49eb722 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -17,7 +17,7 @@ jmhOutputPath=build/reports/jmh/human-readable-output.txt
 jmhJsonOutputPath=build/reports/jmh/results.json
 jmhIncludeRegex=.*
 systemProp.defaultFlinkVersions=2.1
-systemProp.knownFlinkVersions=1.19,1.20,2.0,2.1
+systemProp.knownFlinkVersions=1.20,2.0,2.1
 systemProp.defaultSparkVersions=4.0
 systemProp.knownSparkVersions=3.4,3.5,4.0
 systemProp.defaultKafkaVersions=3
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index ea387e04404a..47e90612b7b4 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -46,7 +46,6 @@ esotericsoftware-kryo = "4.0.3"
 errorprone-annotations = "2.41.0"
 failsafe = "3.3.2"
 findbugs-jsr305 = "3.0.2"
-flink119 = { strictly = "1.19.2"}
 flink120 = { strictly = "1.20.1"}
 flink20 = { strictly = "2.0.0"}
 flink21 = { strictly = "2.1.0"}
@@ -114,12 +113,6 @@ delta-standalone = { module = "io.delta:delta-standalone_2.12", version.ref = "delta-standalone" }
 errorprone-annotations = { module = "com.google.errorprone:error_prone_annotations", version.ref = "errorprone-annotations" }
 failsafe = { module = "dev.failsafe:failsafe", version.ref = "failsafe"}
 findbugs-jsr305 = { module = "com.google.code.findbugs:jsr305", version.ref = "findbugs-jsr305" }
-flink119-avro = { module = "org.apache.flink:flink-avro", version.ref = "flink119" }
-flink119-connector-base = { module = "org.apache.flink:flink-connector-base", version.ref = "flink119" }
-flink119-connector-files = { module = "org.apache.flink:flink-connector-files", version.ref = "flink119" }
-flink119-metrics-dropwizard = { module = "org.apache.flink:flink-metrics-dropwizard", version.ref = "flink119" }
-flink119-streaming-java = { module = "org.apache.flink:flink-streaming-java", version.ref = "flink119" }
-flink119-table-api-java-bridge = { module = "org.apache.flink:flink-table-api-java-bridge", version.ref = "flink119" }
 flink120-avro = { module = "org.apache.flink:flink-avro", version.ref = "flink120" }
 flink120-connector-base = { module = "org.apache.flink:flink-connector-base", version.ref = "flink120" }
 flink120-connector-files = { module = "org.apache.flink:flink-connector-files", version.ref = "flink120" }
@@ -187,11 +180,6 @@ delta-spark = { module = "io.delta:delta-spark_2.12", version.ref = "delta-spark"
 derby-core = { module = "org.apache.derby:derby", version.ref = "derby"}
 derby-tools = { module = "org.apache.derby:derbytools", version.ref = "derby"}
 esotericsoftware-kryo = { module = "com.esotericsoftware:kryo", version.ref = "esotericsoftware-kryo" }
-flink119-connector-test-utils = { module = "org.apache.flink:flink-connector-test-utils", version.ref = "flink119" }
-flink119-core = { module = "org.apache.flink:flink-core", version.ref = "flink119" }
-flink119-runtime = { module = "org.apache.flink:flink-runtime", version.ref = "flink119" }
-flink119-test-utils = { module = "org.apache.flink:flink-test-utils", version.ref = "flink119" }
-flink119-test-utilsjunit = { module = "org.apache.flink:flink-test-utils-junit", version.ref = "flink119" }
 flink120-connector-test-utils = { module = "org.apache.flink:flink-connector-test-utils", version.ref = "flink120" }
 flink120-core = { module = "org.apache.flink:flink-core", version.ref = "flink120" }
 flink120-runtime = { module = "org.apache.flink:flink-runtime", version.ref = "flink120" }
diff --git a/settings.gradle b/settings.gradle
index bf84ccee03d1..de342dda1476 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -112,15 +112,6 @@ if (!flinkVersions.isEmpty()) {
   project(':flink').name = 'iceberg-flink'
 }
 
-if (flinkVersions.contains("1.19")) {
-  include ":iceberg-flink:flink-1.19"
-  include ":iceberg-flink:flink-runtime-1.19"
-  project(":iceberg-flink:flink-1.19").projectDir = file('flink/v1.19/flink')
-  project(":iceberg-flink:flink-1.19").name = "iceberg-flink-1.19"
-  project(":iceberg-flink:flink-runtime-1.19").projectDir = file('flink/v1.19/flink-runtime')
-  project(":iceberg-flink:flink-runtime-1.19").name = "iceberg-flink-runtime-1.19"
-}
-
 if (flinkVersions.contains("1.20")) {
   include ":iceberg-flink:flink-1.20"
   include ":iceberg-flink:flink-runtime-1.20"